From 4876c278ac9a847f32454557282c8b9e8c6216b6 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 28 Oct 2020 17:31:34 +0100 Subject: [PATCH 01/57] Create spektral.data module. This module adds abstractions to represent graphs and graph datasets, as well as data loaders for automatically feeding batches of graphs to a Model (disjoint and batch mode are supported for now). The commit adds: - Graph, to represent graphs - Dataset, to represent graph datasets - Loader, BatchLoader, DisjointLoader, to feed batches of data in the corresponding formats. If TF 2.4 is installed, the Loaders can be passed directly to Model.fit() by calling Loader.tf(). spektral.utils.data is now merged in spektral.data.utils. This commit also adds tests for the new module. --- docs/autogen.py | 7 +- examples/graph_prediction/BDGC_disjoint.py | 8 +- .../graph_prediction/ogbg-mol-hiv_disjoint.py | 8 +- examples/graph_prediction/qm9_disjoint.py | 8 +- examples/graph_prediction/tud_disjoint.py | 8 +- spektral/data/__init__.py | 3 + spektral/data/dataset.py | 131 ++++++++++++++++ spektral/data/graph.py | 50 ++++++ spektral/data/loaders.py | 105 +++++++++++++ spektral/data/utils.py | 146 ++++++++++++++++++ spektral/utils/data.py | 103 ------------ tests/data/test_dataset.py | 65 ++++++++ tests/data/test_graph.py | 37 +++++ tests/data/test_loaders.py | 53 +++++++ tests/data/test_utils.py | 46 ++++++ tests/test_utils.py | 20 --- 16 files changed, 656 insertions(+), 142 deletions(-) create mode 100644 spektral/data/__init__.py create mode 100644 spektral/data/dataset.py create mode 100644 spektral/data/graph.py create mode 100644 spektral/data/loaders.py create mode 100644 spektral/data/utils.py create mode 100644 tests/data/test_dataset.py create mode 100644 tests/data/test_graph.py create mode 100644 tests/data/test_loaders.py create mode 100644 tests/data/test_utils.py delete mode 100644 tests/test_utils.py diff --git a/docs/autogen.py b/docs/autogen.py index d368aef9..0086b5dd 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -8,6 +8,7 @@ import shutil import sys +import spektral.data.utils from spektral import chem from spektral import datasets from spektral import layers @@ -142,9 +143,9 @@ { 'page': 'utils/data.md', 'functions': [ - utils.data.numpy_to_disjoint, - utils.data.numpy_to_batch, - utils.data.batch_iterator + spektral.data.utils.numpy_to_disjoint, + spektral.data.utils.numpy_to_batch, + spektral.data.utils.batch_generator ] }, { diff --git a/examples/graph_prediction/BDGC_disjoint.py b/examples/graph_prediction/BDGC_disjoint.py index 1e3c0e1f..b00bed9f 100644 --- a/examples/graph_prediction/BDGC_disjoint.py +++ b/examples/graph_prediction/BDGC_disjoint.py @@ -21,11 +21,11 @@ from spektral.layers import ops from spektral.layers.pooling import TopKPool from spektral.utils.convolution import normalized_adjacency -from spektral.utils.data import batch_iterator, numpy_to_disjoint +from spektral.data.utils import numpy_to_disjoint, batch_generator def evaluate(A_list, X_list, y_list, ops_list, batch_size): - batches = batch_iterator([X_list, A_list, y_list], batch_size=batch_size) + batches = batch_generator([X_list, A_list, y_list], batch_size=batch_size) output = [] for b in batches: X, A, I = numpy_to_disjoint(*b[:-1]) @@ -124,8 +124,8 @@ def train_step(X_, A_, I_, y_): batches_in_epoch = np.ceil(y_train.shape[0] / batch_size) print('Fitting model') -batches = batch_iterator([X_train, A_train, y_train], - batch_size=batch_size, epochs=epochs) +batches = batch_generator([X_train, A_train, y_train], 
+ batch_size=batch_size, epochs=epochs) for b in batches: current_batch += 1 diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py index 42a66be1..f891841c 100644 --- a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py +++ b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py @@ -16,7 +16,7 @@ from spektral.datasets import ogb from spektral.layers import EdgeConditionedConv, ops, GlobalSumPool -from spektral.utils import batch_iterator, numpy_to_disjoint +from spektral.data.utils import numpy_to_disjoint, batch_generator ################################################################################ # PARAMETERS @@ -86,8 +86,8 @@ def train_step(X_, A_, E_, I_, y_): batches_in_epoch = np.ceil(len(A_tr) / batch_size) print('Fitting model') -batches_train = batch_iterator([X_tr, A_tr, E_tr, y_tr], - batch_size=batch_size, epochs=epochs) +batches_train = batch_generator([X_tr, A_tr, E_tr, y_tr], + batch_size=batch_size, epochs=epochs) for b in batches_train: X_, A_, E_, I_ = numpy_to_disjoint(*b[:-1]) A_ = ops.sp_matrix_to_sp_tensor(A_) @@ -107,7 +107,7 @@ def train_step(X_, A_, E_, I_, y_): print('Testing model') evaluator = Evaluator(name=dataset_name) y_pred = [] -batches_test = batch_iterator([X_te, A_te, E_te], batch_size=batch_size) +batches_test = batch_generator([X_te, A_te, E_te], batch_size=batch_size) for b in batches_test: X_, A_, E_, I_ = numpy_to_disjoint(*b) A_ = ops.sp_matrix_to_sp_tensor(A_) diff --git a/examples/graph_prediction/qm9_disjoint.py b/examples/graph_prediction/qm9_disjoint.py index 9c681623..923a1acb 100644 --- a/examples/graph_prediction/qm9_disjoint.py +++ b/examples/graph_prediction/qm9_disjoint.py @@ -13,7 +13,7 @@ from spektral.datasets import qm9 from spektral.layers import EdgeConditionedConv, ops, GlobalSumPool -from spektral.utils import batch_iterator, numpy_to_disjoint +from spektral.data.utils import numpy_to_disjoint, batch_generator from spektral.utils import label_to_one_hot ################################################################################ @@ -98,8 +98,8 @@ def train_step(X_, A_, E_, I_, y_): batches_in_epoch = np.ceil(len(A_train) / batch_size) print('Fitting model') -batches_train = batch_iterator([X_train, A_train, E_train, y_train], - batch_size=batch_size, epochs=epochs) +batches_train = batch_generator([X_train, A_train, E_train, y_train], + batch_size=batch_size, epochs=epochs) for b in batches_train: X_, A_, E_, I_ = numpy_to_disjoint(*b[:-1]) A_ = ops.sp_matrix_to_sp_tensor(A_) @@ -119,7 +119,7 @@ def train_step(X_, A_, E_, I_, y_): print('Testing model') model_loss = 0 batches_in_epoch = np.ceil(len(A_test) / batch_size) -batches_test = batch_iterator([X_test, A_test, E_test, y_test], batch_size=batch_size) +batches_test = batch_generator([X_test, A_test, E_test, y_test], batch_size=batch_size) for b in batches_test: X_, A_, E_, I_ = numpy_to_disjoint(*b[:-1]) A_ = ops.sp_matrix_to_sp_tensor(A_) diff --git a/examples/graph_prediction/tud_disjoint.py b/examples/graph_prediction/tud_disjoint.py index 32db8fe0..dd6c47da 100644 --- a/examples/graph_prediction/tud_disjoint.py +++ b/examples/graph_prediction/tud_disjoint.py @@ -15,7 +15,7 @@ from spektral.datasets import tud from spektral.layers import GINConv, GlobalAvgPool, ops -from spektral.utils import batch_iterator, numpy_to_disjoint +from spektral.data.utils import numpy_to_disjoint, batch_generator ################################################################################ # PARAMETERS @@ -100,8 +100,8 
@@ def train_step(x_, a_, i_, y_): batches_in_epoch = np.ceil(len(a_train) / batch_size) print('Fitting model') -batches_train = batch_iterator([x_train, a_train, y_train], - batch_size=batch_size, epochs=epochs) +batches_train = batch_generator([x_train, a_train, y_train], + batch_size=batch_size, epochs=epochs) for b in batches_train: x_, a_, i_ = numpy_to_disjoint(*b[:-1]) a_ = ops.sp_matrix_to_sp_tensor(a_) @@ -124,7 +124,7 @@ def train_step(x_, a_, i_, y_): print('Testing model') model_lss = model_acc = 0 batches_in_epoch = np.ceil(len(a_test) / batch_size) -batches_test = batch_iterator([x_test, a_test, y_test], batch_size=batch_size) +batches_test = batch_generator([x_test, a_test, y_test], batch_size=batch_size) for b in batches_test: x_, a_, i_ = numpy_to_disjoint(*b[:-1]) a_ = ops.sp_matrix_to_sp_tensor(a_) diff --git a/spektral/data/__init__.py b/spektral/data/__init__.py new file mode 100644 index 00000000..7b52a983 --- /dev/null +++ b/spektral/data/__init__.py @@ -0,0 +1,3 @@ +from .graph import Graph +from .dataset import Dataset +from .loaders import Loader, BatchLoader, DisjointLoader diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py new file mode 100644 index 00000000..18855307 --- /dev/null +++ b/spektral/data/dataset.py @@ -0,0 +1,131 @@ +import copy + +import tensorflow as tf + +from spektral.data import Graph +from spektral.data.utils import get_spec + + +class Dataset: + """ + A container for Graph objects. This class can be extended to represent a + graph dataset. + + To extend this class, you must implement the `Dataset.read()` method, which + must return a list of `spektral.data.Graph` objects, e.g., + + ``` + class MyDataset(Dataset): + def read(self): + return [ + Graph(x=np.random.rand(n, 2), + adj=np.random.randint(0, 2, (n, n)), + y=np.array([0., 1.])) + for n in range(size) + ] + ``` + + Datasets can be sliced (`dataset[start:stop]`), shuffled + (`np.random.shuffle(dataset)`), and iterated (`for graph in dataset: ...`). + + The size of the node features, edge features and targets is shared by all + graphs in a dataset and can be accessed respectively with: + + ``` + >>> dataset.F + >>> dataset.S + >>> dataset.n_out + ``` + + The general shape, dtype, and `tf.TypeSpec` of the matrices composing the + graphs is stored in `dataset.signature`. This can be useful when + implementing a custom Loader for your dataset. 
+ """ + def __init__(self, **kwargs): + self.graphs = self.read() + # Make sure that we always have at least one graph + if len(self.graphs) == 0: + raise ValueError('Datasets cannot be empty') + self.F = None + self.S = None + self.n_out = None + self.signature = self._signature() + + # Read extra kwargs + for k, v in kwargs.items(): + setattr(self, k, v) + + def read(self): + raise NotImplementedError + + def _signature(self): + signature = {} + graph = self.graphs[0] # This is always non-empty + if graph.x is not None: + signature['x'] = dict() + signature['x']['spec'] = get_spec(graph.x) + signature['x']['shape'] = (None, graph.F) + signature['x']['dtype'] = tf.as_dtype(graph.x.dtype) + self.F = graph.F + if graph.adj is not None: + signature['a'] = dict() + signature['a']['spec'] = get_spec(graph.adj) + signature['a']['shape'] = (None, None) + signature['a']['dtype'] = tf.as_dtype(graph.adj.dtype) + if graph.edge_attr is not None: + signature['e'] = dict() + signature['e']['spec'] = get_spec(graph.edge_attr) + signature['e']['shape'] = (None, graph.S) + signature['e']['dtype'] = tf.as_dtype(graph.edge_attr.dtype) + self.S = graph.S + if graph.y is not None: + signature['y'] = dict() + signature['y']['spec'] = get_spec(graph.y) + signature['y']['shape'] = (graph.y.shape[-1], ) + signature['y']['dtype'] = tf.as_dtype(graph.y.dtype) + self.n_out = graph.y.shape[-1] + + return signature + + def __getitem__(self, key): + if not (isinstance(key, (int, slice, list, tuple))): + raise ValueError('Unsupported key type: {}'.format(type(key))) + if isinstance(key, int): + return self.graphs[key] + else: + dataset = copy.copy(self) + if isinstance(key, slice): + dataset.graphs = self.graphs[key] + else: + dataset.graphs = [self.graphs[i] for i in key] + return dataset + + def __setitem__(self, key, value): + is_iterable = isinstance(value, (list, tuple)) + if not isinstance(value, (Graph, list, tuple)): + raise ValueError('Datasets can only be assigned Graphs or ' + 'sequences of Graphs') + if is_iterable and not all([isinstance(v, Graph) for v in value]): + raise ValueError('Assigned sequence must contain only Graphs') + if is_iterable and isinstance(key, int): + raise ValueError('Cannot assign multiple Graphs to one location') + if not is_iterable and isinstance(key, (slice, list, tuple)): + raise ValueError('Cannot assign one Graph to multiple locations') + if not (isinstance(key, (int, slice, list, tuple))): + raise ValueError('Unsupported key type: {}'.format(type(key))) + + if isinstance(key, int): + self.graphs[key] = value + else: + if isinstance(key, slice): + self.graphs[key] = value + else: + for i, k in enumerate(key): + self.graphs[k] = value[i] + + def __len__(self): + return len(self.graphs) + + def __repr__(self): + return 'Dataset(len={}, signature="{}")'\ + .format(self.__len__(), ', '.join(self.signature.keys())) \ No newline at end of file diff --git a/spektral/data/graph.py b/spektral/data/graph.py new file mode 100644 index 00000000..23cd7ff7 --- /dev/null +++ b/spektral/data/graph.py @@ -0,0 +1,50 @@ +class Graph: + """ + A container to represent a graph with: + - node features; + - adjacency matrix; + - edge attributes; + - node or graph labels; + + See the [data representation page](https://graphneural.network/data/) for + more info. 
+ + This class exposes the following attributes: + + - `N`: number of nodes; + - `F`: size of the node features; + - `S`: size of the edge features; + + **Arguments** + + - `x`: np.array, the node features (shape `(N, F)`); + - `adj`: np.array or scipy.sparse matrix, the adjacency matrix (shape `(N, N)`); + - `edge_attr`: np.array, the edge features (shape `(N, N, S)`); + - `y`: np.array, the node or graph labels (shape `(N, n_labels)` or + `(n_labels, )`); + + + """ + def __init__(self, x=None, adj=None, edge_attr=None, y=None, **kwargs): + self.x = x + self.adj = adj + self.edge_attr = edge_attr + self.y = y + # Read extra kwargs + for k, v in kwargs.items(): + self[k] = v + + self.N = None if self.x is None else self.x.shape[-2] + self.F = None if self.x is None else self.x.shape[-1] + self.S = None if self.edge_attr is None else self.edge_attr.shape[-1] + + def numpy(self): + return tuple(ret for ret in [self.x, self.adj, self.edge_attr, self.y] + if ret is not None) + + def __getitem__(self, key): + return getattr(self, key, None) + + def __repr__(self): + return 'Graph(N={}, F={}, S={}, y={}'\ + .format(self.N, self.F, self.S, self.y) \ No newline at end of file diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py new file mode 100644 index 00000000..d92905d2 --- /dev/null +++ b/spektral/data/loaders.py @@ -0,0 +1,105 @@ +import copy + +import numpy as np +import tensorflow as tf +from scipy import sparse as sp + +from spektral.data.utils import prepend_none, output_signature, numpy_to_disjoint, numpy_to_batch, batch_generator +from spektral.layers.ops import sp_matrix_to_sp_tensor + +version = tf.__version__.split('.') +major, minor = int(version[0]), int(version[1]) +tf_loader_available = major > 2 and minor > 3 + + +class Loader: + def __init__(self, dataset, batch_size=1, epochs=1, shuffle=False): + self.dataset = dataset + self.batch_size = batch_size + self.epochs = epochs + self.shuffle = shuffle + self._generator = batch_generator( + self.dataset, batch_size=self.batch_size, epochs=self.epochs, + shuffle=self.shuffle) + self.steps_per_epoch = int(np.ceil(len(self.dataset) / self.batch_size)) + + def __iter__(self): + return self + + def __next__(self): + nxt = self._generator.__next__() + return self.collate(nxt) + + def tf(self): + raise NotImplementedError + + def collate(self, data_list): + raise NotImplementedError + + def _pack(self, data_list): + return [list(elem) for elem in zip(*[g.numpy() for g in data_list])] + + +class BatchLoader(Loader): + def tf(self): + if not tf_loader_available: + raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' + 'or greater.') + signature = copy.deepcopy(self.dataset.signature) + for k in signature: + signature[k]['shape'] = prepend_none(signature[k]['shape']) + if 'a' in signature: + # Adjacency matrix in batch mode is dense + signature['a']['spec'] = tf.TensorSpec + if 'e' in signature: + # Edge attributes have an extra None dimension in batch mode + signature['e']['shape'] = prepend_none(signature['e']['shape']) + + return tf.data.Dataset.from_generator( + lambda: (_ for _ in self), + output_signature=output_signature(signature) + ) + + def collate(self, data_list): + data_packed = self._pack(data_list) + y = np.array(data_packed[-1]) + ret = numpy_to_batch(*data_packed[:-1]) + + return ret, y + + +class DisjointLoader(Loader): + def tf(self): + if not tf_loader_available: + raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' + 'or greater.') + signature = copy.deepcopy(self.dataset.signature) 
+ if 'y' in signature: + # Edge attributes have an extra None dimension in batch mode + signature['y']['shape'] = prepend_none(signature['y']['shape']) + + if 'a' in signature: + # Adjacency matrix in batch mode is sparse + signature['a']['spec'] = tf.SparseTensorSpec + + signature['i'] = dict() + signature['i']['spec'] = tf.TensorSpec + signature['i']['shape'] = (None, ) + signature['i']['dtype'] = tf.as_dtype(tf.int64) + + return tf.data.Dataset.from_generator( + lambda: (_ for _ in self), + output_signature=output_signature(signature) + ) + + def collate(self, data_list): + data_packed = self._pack(data_list) + y = np.array(data_packed[-1]) + ret = numpy_to_disjoint(*data_packed[:-1]) + ret = list(ret) + for i in range(len(ret)): + if sp.issparse(ret[i]): + ret[i] = sp_matrix_to_sp_tensor(ret[i]) + ret = tuple(ret) + + return ret, y \ No newline at end of file diff --git a/spektral/data/utils.py b/spektral/data/utils.py new file mode 100644 index 00000000..02af2083 --- /dev/null +++ b/spektral/data/utils.py @@ -0,0 +1,146 @@ +import numpy as np +import tensorflow as tf +from scipy import sparse as sp + +from spektral.utils import pad_jagged_array + + +def numpy_to_disjoint(x_list, a_list, e_list=None): + """ + Converts lists of node features, adjacency matrices and (optionally) edge + features to [disjoint mode](https://danielegrattarola.github.io/spektral/data/#disjoint-mode). + + The i-th element of each list must be associated with the i-th graph. + + The method also computes the batch index to retrieve individual graphs + from the disjoint union. + + :param x_list: a list of np.arrays of shape `(N, F)` -- note that `N` can + change between graphs; + :param a_list: a list of np.arrays or scipy.sparse matrices of shape + `(N, N)`; + :param e_list: a list of np.arrays of shape `(N, N, S)`; + :return: + - `x`: np.array of shape `(n_nodes, F)`; + - `a`: scipy.sparse matrix of shape `(n_nodes, n_nodes)`; + - `e`: (only if `e_list` is given) np.array of shape `(n_edges, S)`; + - `i`: np.array of shape `(n_nodes, )`; + """ + x_out = np.vstack(x_list) + a_list = [sp.coo_matrix(a) for a in a_list] + if e_list is not None: + if e_list[0].ndim == 3: + e_list = [e[a.row, a.col] for e, a in zip(e_list, a_list)] + e_out = np.vstack(e_list) + a_out = sp.block_diag(a_list) + n_nodes = np.array([x.shape[0] for x in x_list]) + i_out = np.repeat(np.arange(len(n_nodes)), n_nodes) + if e_list is not None: + return x_out, a_out, e_out, i_out + else: + return x_out, a_out, i_out + + +def numpy_to_batch(x_list, a_list, e_list=None): + """ + Converts lists of node features, adjacency matrices and (optionally) edge + features to [batch mode](https://danielegrattarola.github.io/spektral/data/#batch-mode), + by zero-padding all X, A and E matrices to have the same node dimensions. + + The i-th element of each list must be associated with the i-th graph. + + Note that if `a_list` contains sparse matrices, they will be converted to + dense np.arrays, which can be memory-expensive. 
+ + :param x_list: a list of np.arrays of shape `(N, F)` -- note that `N` can + change between graphs; + :param a_list: a list of np.arrays or scipy.sparse matrices of shape + `(N, N)`; + :param e_list: a list of np.arrays of shape `(N, N, S)`; + :return: + - `x`: np.array of shape `(batch, n_max, F)`; + - `a`: np.array of shape `(batch, n_max, n_max)`; + - `e`: (only if `e_list` is given) np.array of shape + `(batch, n_max, n_max, S)`; + """ + n_max = max([a.shape[-1] for a in a_list]) + x_out = pad_jagged_array(x_list, (n_max, -1)) + # Convert sparse matrices to dense + if hasattr(a_list[0], 'toarray'): + a_list = [a.toarray() for a in a_list] + a_out = pad_jagged_array(a_list, (n_max, n_max)) + if e_list is not None: + e_out = pad_jagged_array(e_list, (n_max, n_max, -1)) + return x_out, a_out, e_out + else: + return x_out, a_out + + +def batch_generator(data, batch_size=32, epochs=1, shuffle=True): + """ + Iterates over the data for the given number of epochs, yielding batches of + size `batch_size`. + :param data: np.array or list of np.arrays with the same first dimension; + :param batch_size: number of samples in a batch; + :param epochs: number of times to iterate over the data; + :param shuffle: whether to shuffle the data at the beginning of each epoch + :return: batches of size `batch_size`. + """ + if not isinstance(data, list): + data = [data] + if len(data) < 1: + raise ValueError('data cannot be empty') + if len(set([len(item) for item in data])) > 1: + raise ValueError('All inputs must have the same __len__') + + len_data = len(data[0]) + batches_per_epoch = int(np.ceil(len_data / batch_size)) + for epochs in range(epochs): + if shuffle: + shuffle_inplace(*data) + for batch in range(batches_per_epoch): + start = batch * batch_size + stop = min(start + batch_size, len_data) + + to_yield = [item[start:stop] for item in data] + if len(data) == 1: + to_yield = to_yield[0] + + yield to_yield + + +def shuffle_inplace(*args): + rng_state = np.random.get_state() + for a in args: + np.random.set_state(rng_state) + np.random.shuffle(a) + + +def get_spec(x): + if isinstance(x, tf.SparseTensor) or sp.issparse(x): + return tf.SparseTensorSpec + else: + return tf.TensorSpec + + +def prepend_none(t): + return (None, ) + t + + +def output_signature(signature): + output = [] + keys = ['x', 'a', 'e', 'i'] + for k in keys: + if k in signature: + shape = signature[k]['shape'] + dtype = signature[k]['dtype'] + spec = signature[k]['spec'] + output.append(spec(shape, dtype)) + output = tuple(output) + if 'y' in signature: + shape = signature['y']['shape'] + dtype = signature['y']['dtype'] + spec = signature['y']['spec'] + output = (output, spec(shape, dtype)) + + return output diff --git a/spektral/utils/data.py b/spektral/utils/data.py index d21cf4e3..b28b04f6 100644 --- a/spektral/utils/data.py +++ b/spektral/utils/data.py @@ -1,106 +1,3 @@ -import numpy as np -import scipy.sparse as sp -from spektral.utils import pad_jagged_array -def numpy_to_disjoint(X_list, A_list, E_list=None): - """ - Converts a batch of graphs stored in lists (X, A, and optionally E) to the - [disjoint mode](https://danielegrattarola.github.io/spektral/data/#disjoint-mode). - - Each entry i of the lists should be associated to the same graph, i.e., - `X_list[i].shape[0] == A_list[i].shape[0] == E_list[i].shape[0]`. - - The method also computes the batch index `I`. 
- - :param X_list: a list of np.arrays of shape `(N, F)`; - :param A_list: a list of np.arrays or sparse matrices of shape `(N, N)`; - :param E_list: a list of np.arrays of shape `(N, N, S)`; - :return: - - `X_out`: a rank 2 array of shape `(n_nodes, F)`; - - `A_out`: a rank 2 array of shape `(n_nodes, n_nodes)`; - - `E_out`: (only if `E_list` is given) a rank 2 array of shape - `(n_edges, S)`; - - `I_out`: a rank 1 array of shape `(n_nodes, )`; - """ - X_out = np.vstack(X_list) - A_list = [sp.coo_matrix(a) for a in A_list] - if E_list is not None: - if E_list[0].ndim == 3: - E_list = [e[a.row, a.col] for e, a in zip(E_list, A_list)] - E_out = np.vstack(E_list) - A_out = sp.block_diag(A_list) - n_nodes = np.array([x.shape[0] for x in X_list]) - I_out = np.repeat(np.arange(len(n_nodes)), n_nodes) - if E_list is not None: - return X_out, A_out, E_out, I_out - else: - return X_out, A_out, I_out - - -def numpy_to_batch(X_list, A_list, E_list=None): - """ - Converts a batch of graphs stored in lists (X, A, and optionally E) to the - [batch mode](https://danielegrattarola.github.io/spektral/data/#batch-mode) - by zero-padding all X, A and E matrices to have the same node dimensions - (`N_max`). - - Each entry i of the lists should be associated to the same graph, i.e., - `X_list[i].shape[0] == A_list[i].shape[0] == E_list[i].shape[0]`. - - Note that if `A_list` contains sparse matrices, they will be converted to - dense np.arrays, which can be expensice. - - :param X_list: a list of np.arrays of shape `(N, F)`; - :param A_list: a list of np.arrays or sparse matrices of shape `(N, N)`; - :param E_list: a list of np.arrays of shape `(N, N, S)`; - :return: - - `X_out`: a rank 3 array of shape `(batch, N_max, F)`; - - `A_out`: a rank 2 array of shape `(batch, N_max, N_max)`; - - `E_out`: (only if `E_list` if given) a rank 2 array of shape - `(batch, N_max, N_max, S)`; - """ - N_max = max([a.shape[-1] for a in A_list]) - X_out = pad_jagged_array(X_list, (N_max, -1)) - # Convert sparse matrices to dense - if hasattr(A_list[0], 'toarray'): - A_list = [a.toarray() for a in A_list] - A_out = pad_jagged_array(A_list, (N_max, N_max)) - if E_list is not None: - E_out = pad_jagged_array(E_list, (N_max, N_max, -1)) - return X_out, A_out, E_out - else: - return X_out, A_out - - -def batch_iterator(data, batch_size=32, epochs=1, shuffle=True): - """ - Iterates over the data for the given number of epochs, yielding batches of - size `batch_size`. - :param data: np.array or list of np.arrays with the same first dimension; - :param batch_size: number of samples in a batch; - :param epochs: number of times to iterate over the data; - :param shuffle: whether to shuffle the data at the beginning of each epoch - :return: batches of size `batch_size`. 
- """ - if not isinstance(data, list): - data = [data] - if len(set([len(item) for item in data])) > 1: - raise ValueError('All arrays must have the same length') - - len_data = len(data[0]) - batches_per_epoch = int(len_data / batch_size) - if len_data % batch_size != 0: - batches_per_epoch += 1 - for epochs in range(epochs): - if shuffle: - shuffle_idx = np.random.permutation(np.arange(len_data)) - data = [np.array(item)[shuffle_idx] for item in data] - for batch in range(batches_per_epoch): - start = batch * batch_size - stop = min(start + batch_size, len_data) - if len(data) > 1: - yield [item[start:stop] for item in data] - else: - yield data[0][start:stop] \ No newline at end of file diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py new file mode 100644 index 00000000..638eaf94 --- /dev/null +++ b/tests/data/test_dataset.py @@ -0,0 +1,65 @@ +import numpy as np + +from spektral.data.dataset import Dataset +from spektral.data.graph import Graph + +n_graphs = 10 +Ns = np.random.randint(3, 8, n_graphs) +f = 3 +s = 3 + + +class TestDataset(Dataset): + def read(self): + return [ + Graph(x=np.random.rand(n, f), + adj=np.random.randint(0, 2, (n, n)), + edge_attr=np.random.rand(n, n, s), + y=np.array([0., 1.])) + for n in Ns + ] + + +def test_dataset(): + d = TestDataset() + + assert d.F == f + assert d.S == s + assert d.n_out == 2 + + # _signature + for k in ['x', 'a', 'e', 'y']: + assert k in d.signature + + # __getitem__ + assert isinstance(d[0], Graph) + assert isinstance(d[:3], Dataset) + assert isinstance(d[[1, 3, 4]], Dataset) + + # __setitem__ + n = 100 + g = Graph(x=np.random.rand(n, f), + adj=np.random.randint(0, 2, (n, n)), + edge_attr=np.random.rand(n, n, s), + y=np.array([0., 1.])) + + # single assignment + d[0] = g + assert d[0].N == n and all([d_.N != n for d_ in d[1:]]) + + # Slice assignment + d[1:3] = [g] * 2 + assert d[1].N == n and d[2].N == n and all([d_.N != n for d_ in d[3:]]) + + # List assignment + d[[3, 4]] = [g] * 2 + assert d[3].N == n and d[4].N == n and all([d_.N != n for d_ in d[5:]]) + + # __len__ + assert d.__len__() == n_graphs + + # __repr__ + print(d) + + # Test that shuffling doesn't crash + np.random.shuffle(d) diff --git a/tests/data/test_graph.py b/tests/data/test_graph.py new file mode 100644 index 00000000..65d9012b --- /dev/null +++ b/tests/data/test_graph.py @@ -0,0 +1,37 @@ +import numpy as np + +from spektral.data.graph import Graph + +N = 5 +F = 4 +S = 3 +n_out = 2 + + +def _check_graph(x, a, e, y): + g = Graph(x=x) + g = Graph(adj=a) + g = Graph(x=x, adj=a, edge_attr=e, y=y) + + # numpy + g_np = g.numpy() + g_gt_names = ['x', 'adj', 'edge_attr', 'y'] + g_gt = [x, a, e, y] + for i in range(len(g_gt)): + assert np.all(g_np[i] == g_gt[i]) + + # __getitem__ + for i in range(len(g_gt)): + assert np.all(g.__getitem__(g_gt_names[i]) == g_gt[i]) + + # __repr__ + print(g) + + +def test_graph(): + x = np.ones((N, F)) + a = np.ones((N, N)) + e = np.ones((N, N, S)) + y = np.ones((n_out,)) + + _check_graph(x, a, e, y) diff --git a/tests/data/test_loaders.py b/tests/data/test_loaders.py new file mode 100644 index 00000000..493eb6c2 --- /dev/null +++ b/tests/data/test_loaders.py @@ -0,0 +1,53 @@ +import numpy as np + +from spektral.data import DisjointLoader, BatchLoader +from spektral.data.dataset import Dataset +from spektral.data.graph import Graph + +n_graphs = 10 +ns = np.random.randint(3, 8, n_graphs) +f = 3 +s = 3 +batch_size = 6 + +# batch size does not fit an integer number of times in n_graphs +graphs_in_batch = n_graphs % 
batch_size +assert graphs_in_batch != 0 + + +class TestDataset(Dataset): + def read(self): + return [ + Graph(x=np.random.rand(n, f), + adj=np.random.randint(0, 2, (n, n)), + edge_attr=np.random.rand(n, n, s), + y=np.array([0., 1.])) + for n in ns + ] + + +def test_disjoint(): + data = TestDataset() + loader = DisjointLoader(data, batch_size=batch_size) + batches = [b for b in loader] + + (x, a, e, i), y = batches[-1] + n = sum(ns[-graphs_in_batch:]) + assert x.shape == (n, f) + assert a.shape == (n, n) + assert len(e.shape) == 2 and e.shape[1] == s # Avoid counting edges + assert i.shape == (n, ) + assert y.shape == (graphs_in_batch, 2) + + +def test_batch(): + data = TestDataset() + loader = BatchLoader(data, batch_size=batch_size) + batches = [b for b in loader] + + (x, a, e), y = batches[-1] + n = max(ns[-graphs_in_batch:]) + assert x.shape == (graphs_in_batch, n, f) + assert a.shape == (graphs_in_batch, n, n) + assert e.shape == (graphs_in_batch, n, n, s) + assert y.shape == (graphs_in_batch, 2) diff --git a/tests/data/test_utils.py b/tests/data/test_utils.py new file mode 100644 index 00000000..ec34c633 --- /dev/null +++ b/tests/data/test_utils.py @@ -0,0 +1,46 @@ +import numpy as np + +from spektral.data import Dataset, Graph +from spektral.data.utils import numpy_to_disjoint, numpy_to_batch, batch_generator +from spektral.datasets import tud + +a_list, x_list, y = tud.load_data('ENZYMES', clean=True) + + +def test_numpy_to_batch(): + x, a = numpy_to_batch(x_list, a_list) + assert x.ndim == 3 + assert a.ndim == 3 + assert x.shape[0] == a.shape[0] + assert x.shape[1] == a.shape[1] == a.shape[2] + + +def test_numpy_to_disjoint(): + x, a, i = numpy_to_disjoint(x_list, a_list) + assert x.ndim == 2 + assert a.ndim == 2 + assert x.shape[0] == a.shape[0] == a.shape[1] + + +def test_batch_generator(): + size = 10 + batch_size = 6 + a = list(range(size)) + b = np.arange(size) + + class TestDataset(Dataset): + def read(self): + return [ + Graph(x=np.random.rand(n, 2), + adj=np.random.randint(0, 2, (n, n)), + y=np.array([0., 1.])) + for n in range(size) + ] + + c = TestDataset() + + batches = batch_generator([a, b, c], batch_size=batch_size, epochs=10) + for batch in batches: + a_, b_, c_ = batch + for i in range(len(a_)): + assert a_[i] == b_[i] == c_[i].N diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index a622b12e..00000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,20 +0,0 @@ -from spektral.datasets import tud -from spektral.utils import numpy_to_batch, numpy_to_disjoint - - -def test_utils_data(): - # Load ENZYMES because we use it also in datasets tests - A_list, X_list, y = tud.load_data('ENZYMES', clean=True) - - # Test numpy to batch - X_batch, A_batch = numpy_to_batch(X_list, A_list) - assert X_batch.ndim == 3 - assert A_batch.ndim == 3 - assert X_batch.shape[0] == A_batch.shape[0] - assert X_batch.shape[1] == A_batch.shape[1] == A_batch.shape[2] - - # Test numpy to disjoint - X_disj, A_disj, I_disj = numpy_to_disjoint(X_list, A_list) - assert X_disj.ndim == 2 - assert A_disj.ndim == 2 - assert X_disj.shape[0] == A_disj.shape[0] == A_disj.shape[1] From e42261c6fde7e8c1fba6b357d3c38fe723f9681f Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 28 Oct 2020 17:42:03 +0100 Subject: [PATCH 02/57] Remove spektral.utils.data Fix imports --- examples/other/graph_signal_classification_mnist.py | 8 ++++---- spektral/utils/__init__.py | 3 +-- spektral/utils/data.py | 3 --- 3 files changed, 5 insertions(+), 9 deletions(-) delete mode 100644 
spektral/utils/data.py diff --git a/examples/other/graph_signal_classification_mnist.py b/examples/other/graph_signal_classification_mnist.py index b099d68c..07346d03 100644 --- a/examples/other/graph_signal_classification_mnist.py +++ b/examples/other/graph_signal_classification_mnist.py @@ -10,7 +10,7 @@ from spektral.datasets import mnist from spektral.layers import GraphConv from spektral.layers.ops import sp_matrix_to_sp_tensor -from spektral.utils import batch_iterator +from spektral.data.utils import batch_generator # Parameters learning_rate = 1e-3 # Learning rate for Adam @@ -87,7 +87,7 @@ def evaluate(x, y): current_patience = patience curent_batch = 0 batches_in_epoch = int(np.ceil(x_tr.shape[0] / batch_size)) -batches_tr = batch_iterator([x_tr, y_tr], batch_size=batch_size, epochs=epochs) +batches_tr = batch_generator([x_tr, y_tr], batch_size=batch_size, epochs=epochs) # Training loop results_tr = [] @@ -100,7 +100,7 @@ def evaluate(x, y): results_tr.append((l, a)) if curent_batch == batches_in_epoch: - batches_va = batch_iterator([x_va, y_va], batch_size=batch_size) + batches_va = batch_generator([x_va, y_va], batch_size=batch_size) results_va = [evaluate(*batch) for batch in batches_va] results_va = np.array(results_va) loss_va, acc_va = results_va.mean(0) @@ -108,7 +108,7 @@ def evaluate(x, y): best_val_loss = loss_va current_patience = patience # Test - batches_te = batch_iterator([x_te, y_te], batch_size=batch_size) + batches_te = batch_generator([x_te, y_te], batch_size=batch_size) results_te = [evaluate(*batch) for batch in batches_te] results_te = np.array(results_te) else: diff --git a/spektral/utils/__init__.py b/spektral/utils/__init__.py index 15432cf0..13dbcb33 100644 --- a/spektral/utils/__init__.py +++ b/spektral/utils/__init__.py @@ -2,5 +2,4 @@ from .convolution import * from .logging import * from .misc import * -from .io import * -from .data import * +from .io import * \ No newline at end of file diff --git a/spektral/utils/data.py b/spektral/utils/data.py deleted file mode 100644 index b28b04f6..00000000 --- a/spektral/utils/data.py +++ /dev/null @@ -1,3 +0,0 @@ - - - From 309e5f0456b886490ff289c8d667e6bd675228ec Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 30 Oct 2020 16:03:30 +0100 Subject: [PATCH 03/57] - Implement QM9 dataset with new Dataset class - Update QM9 examples and tests - Improve memory usage of localpooling_filter - Improve Dataset.__getitem__ --- docs/autogen.py | 4 +- docs/templates/data.md | 4 +- examples/graph_prediction/BDGC_disjoint.py | 6 +- .../graph_prediction/ogbg-mol-hiv_disjoint.py | 6 +- examples/graph_prediction/qm9_batch.py | 51 ++--- examples/graph_prediction/qm9_disjoint.py | 83 +++----- examples/graph_prediction/tud_disjoint.py | 6 +- spektral/data/dataset.py | 9 +- spektral/data/loaders.py | 6 +- spektral/data/utils.py | 70 +++++-- spektral/datasets/qm9.py | 178 +++++++++--------- spektral/utils/convolution.py | 34 ++-- tests/data/test_utils.py | 12 +- tests/test_datasets.py | 12 +- 14 files changed, 242 insertions(+), 239 deletions(-) diff --git a/docs/autogen.py b/docs/autogen.py index 0086b5dd..a9d7c306 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -143,8 +143,8 @@ { 'page': 'utils/data.md', 'functions': [ - spektral.data.utils.numpy_to_disjoint, - spektral.data.utils.numpy_to_batch, + spektral.data.utils.to_disjoint, + spektral.data.utils.to_batch, spektral.data.utils.batch_generator ] }, diff --git a/docs/templates/data.md b/docs/templates/data.md index c917e422..fbd9a69b 100644 --- 
a/docs/templates/data.md +++ b/docs/templates/data.md @@ -70,10 +70,10 @@ Hierarchical pooling layers will return a reduced version of `I` along with the Utilities for creating the disjoint union of a list of graphs are provided in `spektral.utils.data`: ```py ->>> from spektral.utils.data import numpy_to_disjoint +>>> from spektral.utils.data import to_disjoint >>> A_list = [np.ones((2, 2)), np.ones((3, 3))] # One graph has 2 nodes, the other has 3 >>> X_list = [np.random.randn(2, 4), np.random.randn(3, 4)] # F = 4 ->>> X, A, I = numpy_to_disjoint(X_list, A_list) +>>> X, A, I = to_disjoint(X_list, A_list) >>> X.shape (5, 4) >>> A.shape diff --git a/examples/graph_prediction/BDGC_disjoint.py b/examples/graph_prediction/BDGC_disjoint.py index b00bed9f..01ebfd34 100644 --- a/examples/graph_prediction/BDGC_disjoint.py +++ b/examples/graph_prediction/BDGC_disjoint.py @@ -21,14 +21,14 @@ from spektral.layers import ops from spektral.layers.pooling import TopKPool from spektral.utils.convolution import normalized_adjacency -from spektral.data.utils import numpy_to_disjoint, batch_generator +from spektral.data.utils import to_disjoint, batch_generator def evaluate(A_list, X_list, y_list, ops_list, batch_size): batches = batch_generator([X_list, A_list, y_list], batch_size=batch_size) output = [] for b in batches: - X, A, I = numpy_to_disjoint(*b[:-1]) + X, A, I = to_disjoint(*b[:-1]) A = ops.sp_matrix_to_sp_tensor(A) y = b[-1] pred = model([X, A, I], training=False) @@ -129,7 +129,7 @@ def train_step(X_, A_, I_, y_): for b in batches: current_batch += 1 - X_, A_, I_ = numpy_to_disjoint(*b[:-1]) + X_, A_, I_ = to_disjoint(*b[:-1]) A_ = ops.sp_matrix_to_sp_tensor(A_) y_ = b[-1] outs = train_step(X_, A_, I_, y_) diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py index f891841c..656c9c28 100644 --- a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py +++ b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py @@ -16,7 +16,7 @@ from spektral.datasets import ogb from spektral.layers import EdgeConditionedConv, ops, GlobalSumPool -from spektral.data.utils import numpy_to_disjoint, batch_generator +from spektral.data.utils import to_disjoint, batch_generator ################################################################################ # PARAMETERS @@ -89,7 +89,7 @@ def train_step(X_, A_, E_, I_, y_): batches_train = batch_generator([X_tr, A_tr, E_tr, y_tr], batch_size=batch_size, epochs=epochs) for b in batches_train: - X_, A_, E_, I_ = numpy_to_disjoint(*b[:-1]) + X_, A_, E_, I_ = to_disjoint(*b[:-1]) A_ = ops.sp_matrix_to_sp_tensor(A_) y_ = b[-1] outs = train_step(X_, A_, E_, I_, y_) @@ -109,7 +109,7 @@ def train_step(X_, A_, E_, I_, y_): y_pred = [] batches_test = batch_generator([X_te, A_te, E_te], batch_size=batch_size) for b in batches_test: - X_, A_, E_, I_ = numpy_to_disjoint(*b) + X_, A_, E_, I_ = to_disjoint(*b) A_ = ops.sp_matrix_to_sp_tensor(A_) p = model([X_, A_, E_, I_], training=False) y_pred.append(p.numpy()) diff --git a/examples/graph_prediction/qm9_batch.py b/examples/graph_prediction/qm9_batch.py index 7a954bbb..acbdd3fb 100644 --- a/examples/graph_prediction/qm9_batch.py +++ b/examples/graph_prediction/qm9_batch.py @@ -9,7 +9,9 @@ from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam +from spektral.data import BatchLoader from spektral.datasets import qm9 +from spektral.datasets.qm9 import QM9 from spektral.layers import EdgeConditionedConv, GlobalSumPool from spektral.utils import 
label_to_one_hot @@ -18,45 +20,29 @@ ################################################################################ learning_rate = 1e-3 # Learning rate epochs = 10 # Number of training epochs -batch_size = 32 # Batch size +batch_size = 32 # Batch size ################################################################################ # LOAD DATA ################################################################################ -A, X, E, y = qm9.load_data(return_type='numpy', - nf_keys='atomic_num', - ef_keys='type', - self_loops=True, - amount=1000) # Set to None to train on whole dataset -y = y[['cv']].values # Heat capacity at 298.15K - -# Preprocessing -X_uniq = np.unique(X) -X_uniq = X_uniq[X_uniq != 0] -E_uniq = np.unique(E) -E_uniq = E_uniq[E_uniq != 0] - -X = label_to_one_hot(X, X_uniq) -E = label_to_one_hot(E, E_uniq) +dataset = QM9(amount=1000) # Set amount=None to train on whole dataset # Parameters -N = X.shape[-2] # Number of nodes in the graphs -F = X[0].shape[-1] # Dimension of node features -S = E[0].shape[-1] # Dimension of edge features -n_out = y.shape[-1] # Dimension of the target +F = dataset.F # Dimension of node features +S = dataset.S # Dimension of edge features +n_out = dataset.n_out # Dimension of the target # Train/test split -A_train, A_test, \ -X_train, X_test, \ -E_train, E_test, \ -y_train, y_test = train_test_split(A, X, E, y, test_size=0.1, random_state=0) +idxs = np.random.permutation(len(dataset)) +split = int(0.9 * len(dataset)) +dataset_tr, dataset_te = dataset[:split], dataset[split:] ################################################################################ # BUILD MODEL ################################################################################ -X_in = Input(shape=(N, F)) -A_in = Input(shape=(N, N)) -E_in = Input(shape=(N, N, S)) +X_in = Input(shape=(None, F)) +A_in = Input(shape=(None, None)) +E_in = Input(shape=(None, None, S)) X_1 = EdgeConditionedConv(32, activation='relu')([X_in, A_in, E_in]) X_2 = EdgeConditionedConv(32, activation='relu')([X_1, A_in, E_in]) @@ -72,16 +58,15 @@ ################################################################################ # FIT MODEL ################################################################################ -model.fit([X_train, A_train, E_train], - y_train, - batch_size=batch_size, +loader_tr = BatchLoader(dataset_tr, batch_size=batch_size, epochs=epochs) +model.fit(loader_tr, + steps_per_epoch=loader_tr.steps_per_epoch, epochs=epochs) ################################################################################ # EVALUATE MODEL ################################################################################ print('Testing model') -model_loss = model.evaluate([X_test, A_test, E_test], - y_test, - batch_size=batch_size) +loader_te = BatchLoader(dataset_te, batch_size=batch_size) +model_loss = model.evaluate(loader_te) print('Done. 
Test loss: {}'.format(model_loss)) diff --git a/examples/graph_prediction/qm9_disjoint.py b/examples/graph_prediction/qm9_disjoint.py index 923a1acb..13217fa0 100644 --- a/examples/graph_prediction/qm9_disjoint.py +++ b/examples/graph_prediction/qm9_disjoint.py @@ -11,9 +11,11 @@ from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam +from spektral.data import DisjointLoader from spektral.datasets import qm9 +from spektral.datasets.qm9 import QM9 from spektral.layers import EdgeConditionedConv, ops, GlobalSumPool -from spektral.data.utils import numpy_to_disjoint, batch_generator +from spektral.data.utils import to_disjoint, batch_generator from spektral.utils import label_to_one_hot ################################################################################ @@ -26,33 +28,17 @@ ################################################################################ # LOAD DATA ################################################################################ -A, X, E, y = qm9.load_data(return_type='numpy', - nf_keys='atomic_num', - ef_keys='type', - self_loops=False, - auto_pad=False, - amount=1000) # Set to None to train on whole dataset -y = y[['cv']].values # Heat capacity at 298.15K - -# Preprocessing -X_uniq = np.unique([v for x in X for v in np.unique(x)]) -E_uniq = np.unique([v for e in E for v in np.unique(e)]) -X_uniq = X_uniq[X_uniq != 0] -E_uniq = E_uniq[E_uniq != 0] - -X = [label_to_one_hot(x, labels=X_uniq) for x in X] -E = [label_to_one_hot(e, labels=E_uniq) for e in E] +dataset = QM9(amount=1000) # Set amount=None to train on whole dataset # Parameters -F = X[0].shape[-1] # Dimension of node features -S = E[0].shape[-1] # Dimension of edge features -n_out = y.shape[-1] # Dimension of the target +F = dataset.F # Dimension of node features +S = dataset.S # Dimension of edge features +n_out = dataset.n_out # Dimension of the target # Train/test split -A_train, A_test, \ -X_train, X_test, \ -E_train, E_test, \ -y_train, y_test = train_test_split(A, X, E, y, test_size=0.1, random_state=0) +idxs = np.random.permutation(len(dataset)) +split = int(0.9 * len(dataset)) +dataset_tr, dataset_te = dataset[:split], dataset[split:] ################################################################################ # BUILD MODEL @@ -74,16 +60,16 @@ @tf.function( - input_signature=(tf.TensorSpec((None, F), dtype=tf.float64), - tf.SparseTensorSpec((None, None), dtype=tf.float64), - tf.TensorSpec((None, S), dtype=tf.float64), - tf.TensorSpec((None,), dtype=tf.int32), + input_signature=((tf.TensorSpec((None, F), dtype=tf.float64), + tf.SparseTensorSpec((None, None), dtype=tf.int64), + tf.TensorSpec((None, S), dtype=tf.float64), + tf.TensorSpec((None,), dtype=tf.int64)), tf.TensorSpec((None, n_out), dtype=tf.float64)), experimental_relax_shapes=True) -def train_step(X_, A_, E_, I_, y_): +def train_step(inputs, target): with tf.GradientTape() as tape: - predictions = model([X_, A_, E_, I_], training=True) - loss = loss_fn(y_, predictions) + predictions = model(inputs, training=True) + loss = loss_fn(target, predictions) loss += sum(model.losses) gradients = tape.gradient(loss, model.trainable_variables) opt.apply_gradients(zip(gradients, model.trainable_variables)) @@ -95,21 +81,16 @@ def train_step(X_, A_, E_, I_, y_): ################################################################################ current_batch = 0 model_loss = 0 -batches_in_epoch = np.ceil(len(A_train) / batch_size) print('Fitting model') -batches_train = batch_generator([X_train, A_train, E_train, 
y_train], - batch_size=batch_size, epochs=epochs) -for b in batches_train: - X_, A_, E_, I_ = numpy_to_disjoint(*b[:-1]) - A_ = ops.sp_matrix_to_sp_tensor(A_) - y_ = b[-1] - outs = train_step(X_, A_, E_, I_, y_) - - model_loss += outs.numpy() +loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs, shuffle=True) +for batch in loader_tr: + outs = train_step(*batch) + + model_loss += outs current_batch += 1 - if current_batch == batches_in_epoch: - print('Loss: {}'.format(model_loss / batches_in_epoch)) + if current_batch == loader_tr.steps_per_epoch: + print('Loss: {}'.format(model_loss / loader_tr.steps_per_epoch)) model_loss = 0 current_batch = 0 @@ -118,14 +99,10 @@ def train_step(X_, A_, E_, I_, y_): ################################################################################ print('Testing model') model_loss = 0 -batches_in_epoch = np.ceil(len(A_test) / batch_size) -batches_test = batch_generator([X_test, A_test, E_test, y_test], batch_size=batch_size) -for b in batches_test: - X_, A_, E_, I_ = numpy_to_disjoint(*b[:-1]) - A_ = ops.sp_matrix_to_sp_tensor(A_) - y_ = b[3] - - predictions = model([X_, A_, E_, I_], training=False) - model_loss += loss_fn(y_, predictions) -model_loss /= batches_in_epoch +loader_te = DisjointLoader(dataset_te, batch_size=batch_size) +for batch in loader_te: + inputs, target = batch + predictions = model(inputs, training=False) + model_loss += loss_fn(target, predictions) +model_loss /= loader_te.steps_per_epoch print('Done. Test loss: {}'.format(model_loss)) diff --git a/examples/graph_prediction/tud_disjoint.py b/examples/graph_prediction/tud_disjoint.py index dd6c47da..f7f557b1 100644 --- a/examples/graph_prediction/tud_disjoint.py +++ b/examples/graph_prediction/tud_disjoint.py @@ -15,7 +15,7 @@ from spektral.datasets import tud from spektral.layers import GINConv, GlobalAvgPool, ops -from spektral.data.utils import numpy_to_disjoint, batch_generator +from spektral.data.utils import to_disjoint, batch_generator ################################################################################ # PARAMETERS @@ -103,7 +103,7 @@ def train_step(x_, a_, i_, y_): batches_train = batch_generator([x_train, a_train, y_train], batch_size=batch_size, epochs=epochs) for b in batches_train: - x_, a_, i_ = numpy_to_disjoint(*b[:-1]) + x_, a_, i_ = to_disjoint(*b[:-1]) a_ = ops.sp_matrix_to_sp_tensor(a_) y_ = b[-1] lss, acc = train_step(x_, a_, i_, y_) @@ -126,7 +126,7 @@ def train_step(x_, a_, i_, y_): batches_in_epoch = np.ceil(len(a_test) / batch_size) batches_test = batch_generator([x_test, a_test, y_test], batch_size=batch_size) for b in batches_test: - x_, a_, i_ = numpy_to_disjoint(*b[:-1]) + x_, a_, i_ = to_disjoint(*b[:-1]) a_ = ops.sp_matrix_to_sp_tensor(a_) y_ = b[-1] predictions = model([x_, a_, i_], training=False) diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 18855307..7084ad0b 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -5,6 +5,8 @@ from spektral.data import Graph from spektral.data.utils import get_spec +import numpy as np + class Dataset: """ @@ -88,10 +90,11 @@ def _signature(self): return signature def __getitem__(self, key): - if not (isinstance(key, (int, slice, list, tuple))): + if not (np.issubdtype(type(key), np.integer) or + isinstance(key, (slice, list, tuple))): raise ValueError('Unsupported key type: {}'.format(type(key))) - if isinstance(key, int): - return self.graphs[key] + if np.issubdtype(type(key), np.integer): + return self.graphs[int(key)] else: dataset = 
copy.copy(self) if isinstance(key, slice): diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index d92905d2..f93ec258 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -4,7 +4,7 @@ import tensorflow as tf from scipy import sparse as sp -from spektral.data.utils import prepend_none, output_signature, numpy_to_disjoint, numpy_to_batch, batch_generator +from spektral.data.utils import prepend_none, output_signature, to_disjoint, to_batch, batch_generator from spektral.layers.ops import sp_matrix_to_sp_tensor version = tf.__version__.split('.') @@ -63,7 +63,7 @@ def tf(self): def collate(self, data_list): data_packed = self._pack(data_list) y = np.array(data_packed[-1]) - ret = numpy_to_batch(*data_packed[:-1]) + ret = to_batch(*data_packed[:-1]) return ret, y @@ -95,7 +95,7 @@ def tf(self): def collate(self, data_list): data_packed = self._pack(data_list) y = np.array(data_packed[-1]) - ret = numpy_to_disjoint(*data_packed[:-1]) + ret = to_disjoint(*data_packed[:-1]) ret = list(ret) for i in range(len(ret)): if sp.issparse(ret[i]): diff --git a/spektral/data/utils.py b/spektral/data/utils.py index 02af2083..735abd34 100644 --- a/spektral/data/utils.py +++ b/spektral/data/utils.py @@ -5,7 +5,16 @@ from spektral.utils import pad_jagged_array -def numpy_to_disjoint(x_list, a_list, e_list=None): +def _check_input(x_list, a_list, e_list=None): + if not len(x_list) == len(a_list): + raise ValueError('x_list and a_list must have the same length') + if e_list is not None and len(e_list) != len(x_list): + raise ValueError('x_list, a_list, and e_list must have the same length') + if len(x_list) < 1: + raise ValueError('Need at least one graph') + + +def to_disjoint(x_list, a_list, e_list=None): """ Converts lists of node features, adjacency matrices and (optionally) edge features to [disjoint mode](https://danielegrattarola.github.io/spektral/data/#disjoint-mode). @@ -15,42 +24,63 @@ def numpy_to_disjoint(x_list, a_list, e_list=None): The method also computes the batch index to retrieve individual graphs from the disjoint union. + The edge attributes of a graph can be represented as + + - a dense array of shape `(N, N, S)`; + - a sparse edge list of shape `(n_edges, S)`; + + and they will always be returned as edge list for efficiency. 
+ :param x_list: a list of np.arrays of shape `(N, F)` -- note that `N` can change between graphs; :param a_list: a list of np.arrays or scipy.sparse matrices of shape `(N, N)`; - :param e_list: a list of np.arrays of shape `(N, N, S)`; + :param e_list: a list of np.arrays of shape `(N, N, S)` or `(n_edges, S)`; :return: - `x`: np.array of shape `(n_nodes, F)`; - `a`: scipy.sparse matrix of shape `(n_nodes, n_nodes)`; - - `e`: (only if `e_list` is given) np.array of shape `(n_edges, S)`; + - `e`: (optional) np.array of shape `(n_edges, S)`; - `i`: np.array of shape `(n_nodes, )`; """ + _check_input(x_list, a_list, e_list) + + # Node features x_out = np.vstack(x_list) - a_list = [sp.coo_matrix(a) for a in a_list] - if e_list is not None: - if e_list[0].ndim == 3: - e_list = [e[a.row, a.col] for e, a in zip(e_list, a_list)] - e_out = np.vstack(e_list) + + # Adjacency matrix a_out = sp.block_diag(a_list) + + # Batch index n_nodes = np.array([x.shape[0] for x in x_list]) i_out = np.repeat(np.arange(len(n_nodes)), n_nodes) + + # Edge attributes if e_list is not None: + if e_list[0].ndim == 3: # Convert dense to sparse + e_list = [e[a.row, a.col] for e, a in zip(e_list, a_list)] + e_out = np.vstack(e_list) return x_out, a_out, e_out, i_out else: return x_out, a_out, i_out -def numpy_to_batch(x_list, a_list, e_list=None): +def to_batch(x_list, a_list, e_list=None): """ Converts lists of node features, adjacency matrices and (optionally) edge features to [batch mode](https://danielegrattarola.github.io/spektral/data/#batch-mode), - by zero-padding all X, A and E matrices to have the same node dimensions. + by zero-padding all tensors to have the same node dimension `n_max`. The i-th element of each list must be associated with the i-th graph. - Note that if `a_list` contains sparse matrices, they will be converted to - dense np.arrays, which can be memory-expensive. + If `a_list` contains sparse matrices, they will be converted to dense + np.arrays, which can be expensive. + + The edge attributes of a graph can be represented as + + - a dense array of shape `(N, N, S)`; + - a sparse edge list of shape `(n_edges, S)`; + + and they will always be returned as dense arrays. 
:param x_list: a list of np.arrays of shape `(N, F)` -- note that `N` can change between graphs; @@ -63,13 +93,25 @@ def numpy_to_batch(x_list, a_list, e_list=None): - `e`: (only if `e_list` is given) np.array of shape `(batch, n_max, n_max, S)`; """ + _check_input(x_list, a_list, e_list) n_max = max([a.shape[-1] for a in a_list]) + + # Node features x_out = pad_jagged_array(x_list, (n_max, -1)) - # Convert sparse matrices to dense - if hasattr(a_list[0], 'toarray'): + + # Adjacency matrix + if hasattr(a_list[0], 'toarray'): # Convert sparse to dense a_list = [a.toarray() for a in a_list] a_out = pad_jagged_array(a_list, (n_max, n_max)) + + # Edge attributes if e_list is not None: + if e_list[0].ndim == 2: # Sparse to dense + for i in range(len(a_list)): + a, e = a_list[i], e_list[i] + e_new = np.zeros(a.shape + e.shape[-1:]) + e_new[np.nonzero(a)] = e + e_list[i] = e_new e_out = pad_jagged_array(e_list, (n_max, n_max, -1)) return x_out, a_out, e_out else: diff --git a/spektral/datasets/qm9.py b/spektral/datasets/qm9.py index 73febd5c..20d0ded8 100644 --- a/spektral/datasets/qm9.py +++ b/spektral/datasets/qm9.py @@ -1,109 +1,105 @@ import os +import numpy as np +import scipy.sparse as sp from tensorflow.keras.utils import get_file -from spektral.chem import sdf_to_nx -from spektral.utils import nx_to_numpy +from spektral.data import Dataset, Graph +from spektral.utils import label_to_one_hot from spektral.utils.io import load_csv, load_sdf DATA_PATH = os.path.expanduser('~/.spektral/datasets/qm9/') DATASET_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/gdb9.tar.gz' -RETURN_TYPES = {'numpy', 'networkx', 'sdf'} -NODE_FEATURES = ['atomic_num', 'charge', 'coords', 'iso'] -EDGE_FEATURES = ['type', 'stereo'] -MAX_K = 9 +ATOM_TYPES = [1, 6, 7, 8, 9] +BOND_TYPES = [1, 2, 3, 4] -def load_data(nf_keys=None, ef_keys=None, auto_pad=True, self_loops=False, - amount=None, return_type='numpy'): + +class QM9(Dataset): """ - Loads the QM9 chemical data set of small molecules. - - Nodes represent heavy atoms (hydrogens are discarded), edges represent - chemical bonds. - - The node features represent the chemical properties of each atom, and are - loaded according to the `nf_keys` argument. - See `spektral.datasets.qm9.NODE_FEATURES` for possible node features, and - see [this link](http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx) - for the meaning of each property. Usually, it is sufficient to load the - atomic number. - - The edge features represent the type and stereoscopy of each chemical bond - between two atoms. - See `spektral.datasets.qm9.EDGE_FEATURES` for possible edge features, and - see [this link](http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx) - for the meaning of each property. Usually, it is sufficient to load the - type of bond. - - :param nf_keys: list or str, node features to return (see `qm9.NODE_FEATURES` - for available features); - :param ef_keys: list or str, edge features to return (see `qm9.EDGE_FEATURES` - for available features); - :param auto_pad: if `return_type='numpy'`, zero pad graph matrices to have - the same number of nodes; - :param self_loops: if `return_type='numpy'`, add self loops to adjacency - matrices; - :param amount: the amount of molecules to return (in ascending order by - number of atoms). 
- :param return_type: `'numpy'`, `'networkx'`, or `'sdf'`, data format to return; - :return: - - if `return_type='numpy'`, the adjacency matrix, node features, - edge features, and a Pandas dataframe containing labels; - - if `return_type='networkx'`, a list of graphs in Networkx format, - and a dataframe containing labels; - - if `return_type='sdf'`, a list of molecules in the internal SDF format and - a dataframe containing labels. + The QM9 chemical data set of small molecules. + + In this dataset, nodes represent atoms and edges represent chemical bonds. + There are 5 possible atom types (H, C, N, O, F) and 4 bond types (single, + double, triple, aromatic). + + Node features represent the chemical properties of each atom and include: + + - The atomic number, one-hot encoded; + - The atom's position in the X, Y, and Z dimensions; + - The atomic charge; + - The mass difference from the monoisotope; + + See [this link](http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx) + for more information. + + The edge features represent the type of chemical bond between two atoms. + + Labels represent... TODO + + **Arguments** + + - `amount`: int, load this many molecules instead of the full dataset + (useful for debugging). """ - if return_type not in RETURN_TYPES: - raise ValueError('Possible return_type: {}'.format(RETURN_TYPES)) - - if not os.path.exists(DATA_PATH): - _download_data() # Try to download dataset - - print('Loading QM9 dataset.') - sdf_file = os.path.join(DATA_PATH, 'qm9.sdf') - data = load_sdf(sdf_file, amount=amount) # Internal SDF format - - # Load labels - labels_file = os.path.join(DATA_PATH, 'qm9.sdf.csv') - labels = load_csv(labels_file) - if amount is not None: - labels = labels[:amount] - if return_type == 'sdf': - return data, labels - else: - # Convert to Networkx - data = [sdf_to_nx(_) for _ in data] - - if return_type == 'numpy': - if nf_keys is not None: - if isinstance(nf_keys, str): - nf_keys = [nf_keys] - else: - nf_keys = NODE_FEATURES - if ef_keys is not None: - if isinstance(ef_keys, str): - ef_keys = [ef_keys] - else: - ef_keys = EDGE_FEATURES - - adj, nf, ef = nx_to_numpy(data, - auto_pad=auto_pad, self_loops=self_loops, - nf_keys=nf_keys, ef_keys=ef_keys) - return adj, nf, ef, labels - elif return_type == 'networkx': - return data, labels - else: - # Should not get here - raise RuntimeError() + def __init__(self, amount=None, **kwargs): + self.amount = amount + super().__init__(**kwargs) + + def read(self): + if not os.path.exists(DATA_PATH): + _download_data() # Try to download dataset + + # Load molecular graphs + print('Loading QM9 dataset.') + sdf_file = os.path.join(DATA_PATH, 'qm9.sdf') + data = load_sdf(sdf_file, amount=self.amount) # Internal SDF format + + x_list, a_list, e_list = [], [], [] + for mol in data: + x = np.array([atom_to_feature(atom) for atom in mol['atoms']]) + a, e = mol_to_adj(mol) + x_list += [x] + a_list += [a] + e_list += [e] + + # Load labels + labels_file = os.path.join(DATA_PATH, 'qm9.sdf.csv') + labels = load_csv(labels_file) + labels = labels.set_index('mol_id').values[:, 1:] + if self.amount is not None: + labels = labels[:self.amount] + + return [Graph(x=x, adj=a, edge_attr=e, y=y) + for x, a, e, y in zip(x_list, a_list, e_list, labels)] + + +def atom_to_feature(atom): + atomic_num = label_to_one_hot(atom['atomic_num'], ATOM_TYPES) + coords = atom['coords'] + charge = atom['charge'] + iso = atom['iso'] + + return np.concatenate((atomic_num, coords, [charge, iso]), -1) + + +def 
mol_to_adj(mol): + row, col, edge_attr = [], [], [] + for bond in mol['bonds']: + start, end = bond['start_atom'], bond['end_atom'] + row += [start, end] + col += [end, start] + edge_attr += [bond['type']] * 2 + + a = sp.csr_matrix((np.ones_like(row), (row, col))) + edge_attr = np.array([label_to_one_hot(e, BOND_TYPES) + for e in edge_attr]) + return a, edge_attr def _download_data(): - _ = get_file( - 'qm9.tar.gz', DATASET_URL, - extract=True, cache_dir=DATA_PATH, cache_subdir=DATA_PATH - ) + _ = get_file('qm9.tar.gz', DATASET_URL, extract=True, cache_dir=DATA_PATH, + cache_subdir=DATA_PATH) os.rename(DATA_PATH + 'gdb9.sdf', DATA_PATH + 'qm9.sdf') os.rename(DATA_PATH + 'gdb9.sdf.csv', DATA_PATH + 'qm9.sdf.csv') os.remove(DATA_PATH + 'qm9.tar.gz') diff --git a/spektral/utils/convolution.py b/spektral/utils/convolution.py index 783fcd86..3fdc3014 100644 --- a/spektral/utils/convolution.py +++ b/spektral/utils/convolution.py @@ -1,3 +1,5 @@ +import copy + import numpy as np from scipy import sparse as sp from scipy.sparse.linalg import ArpackNoConvergence @@ -27,7 +29,7 @@ def degree_power(A, k): :return: if A is a dense array, a dense array; if A is sparse, a sparse matrix in DIA format. """ - degrees = np.power(np.array(A.sum(1)), k).flatten() + degrees = np.power(np.array(A.sum(1)), k).ravel() degrees[np.isinf(degrees)] = 0. if sp.issparse(A): D = sp.diags(degrees) @@ -46,11 +48,10 @@ def normalized_adjacency(A, symmetric=True): """ if symmetric: normalized_D = degree_power(A, -0.5) - output = normalized_D.dot(A).dot(normalized_D) + return normalized_D.dot(A).dot(normalized_D) else: normalized_D = degree_power(A, -1.) - output = normalized_D.dot(A) - return output + return normalized_D.dot(A) def laplacian(A): @@ -109,22 +110,19 @@ def localpooling_filter(A, symmetric=True): \(\D^{-\frac{1}{2}}\A\D^{-\frac{1}{2}}\) or as \(\D^{-1}\A\); :return: array or sparse matrix with rank 2 or 3, same as A; """ - fltr = A.copy() - if sp.issparse(A): - I = sp.eye(A.shape[-1], dtype=A.dtype) - else: - I = np.eye(A.shape[-1], dtype=A.dtype) - if A.ndim == 3: - for i in range(A.shape[0]): - A_tilde = A[i] + I - fltr[i] = normalized_adjacency(A_tilde, symmetric=symmetric) + out = copy.deepcopy(A) + if isinstance(A, list) or (isinstance(A, np.ndarray) and A.ndim == 3): + for i in range(len(A)): + out[i] = A[i] + out[i][np.diag_indices_from(out[i])] += 1 + out[i] = normalized_adjacency(out[i], symmetric=symmetric) else: - A_tilde = A + I - fltr = normalized_adjacency(A_tilde, symmetric=symmetric) + out[np.diag_indices_from(out)] += 1 + out = normalized_adjacency(out, symmetric=symmetric) - if sp.issparse(fltr): - fltr.sort_indices() - return fltr + if sp.issparse(out): + out.sort_indices() + return out def chebyshev_polynomial(X, k): diff --git a/tests/data/test_utils.py b/tests/data/test_utils.py index ec34c633..48f07783 100644 --- a/tests/data/test_utils.py +++ b/tests/data/test_utils.py @@ -1,22 +1,24 @@ import numpy as np from spektral.data import Dataset, Graph -from spektral.data.utils import numpy_to_disjoint, numpy_to_batch, batch_generator +from spektral.data.utils import to_disjoint, to_batch, batch_generator from spektral.datasets import tud a_list, x_list, y = tud.load_data('ENZYMES', clean=True) -def test_numpy_to_batch(): - x, a = numpy_to_batch(x_list, a_list) +def test_to_batch(): + # TODO test e_list + x, a = to_batch(x_list, a_list) assert x.ndim == 3 assert a.ndim == 3 assert x.shape[0] == a.shape[0] assert x.shape[1] == a.shape[1] == a.shape[2] -def test_numpy_to_disjoint(): - x, a, i = 
numpy_to_disjoint(x_list, a_list) +def test_to_disjoint(): + # TODO test e_list + x, a, i = to_disjoint(x_list, a_list) assert x.ndim == 2 assert a.ndim == 2 assert x.shape[0] == a.shape[0] == a.shape[1] diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 8de43641..3aa3efc3 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,4 +1,5 @@ from spektral.datasets import delaunay, qm9, citation, graphsage, mnist, tud +from spektral.data import DisjointLoader, BatchLoader def correctly_padded(adj, nf, ef): @@ -39,13 +40,12 @@ def test_mnist(): def test_qm9(): - adj, nf, ef, labels = qm9.load_data(return_type='numpy', amount=1000) - correctly_padded(adj, nf, ef) - assert adj.shape[0] == labels.shape[0] + dataset = qm9.QM9(amount=100) + dl = DisjointLoader(dataset, batch_size=3) + dl.__next__() - # Test that it doesn't crash - qm9.load_data(return_type='networkx', amount=1000) - qm9.load_data(return_type='sdf', amount=1000) + bl = BatchLoader(dataset, batch_size=3) + bl.__next__() def test_tud(): From 183215c33bc6b7023e6d1f62beb8132857c476b0 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 30 Oct 2020 16:05:35 +0100 Subject: [PATCH 04/57] Speed up spektral.utils.misc.pad_jagged_array by up to 10x --- spektral/utils/misc.py | 44 ++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/spektral/utils/misc.py b/spektral/utils/misc.py index ed0b82fb..809c266e 100644 --- a/spektral/utils/misc.py +++ b/spektral/utils/misc.py @@ -6,35 +6,25 @@ def pad_jagged_array(x, target_shape): """ Given a jagged array of arbitrary dimensions, zero-pads all elements in the array to match the provided `target_shape`. - :param x: a list or np.array of dtype object, containing np.arrays of - varying dimensions + :param x: a list or np.array of dtype object, containing np.arrays with + variable dimensions; :param target_shape: a tuple or list s.t. target_shape[i] >= x.shape[i] - for each x in X. - If `target_shape[i] = -1`, it will be automatically converted to X.shape[i], - so that passing a target shape of e.g. (-1, n, m) will leave the first - dimension of each element untouched (note that the creation of the output - array may fail if the result is again a jagged array). - :return: a zero-padded np.array of shape `(X.shape[0], ) + target_shape` - """ - if isinstance(x, list): - x = np.array(x, dtype=object) - - for i in range(len(x)): - shapes = [] - for j in range(len(target_shape)): - ts = target_shape[j] - cs = x[i].shape[j] - shapes.append((cs if ts == -1 else ts, cs)) - if x.ndim == 1: - x[i] = np.pad(x[i], [(0, ts - cs) for ts, cs in shapes], 'constant') - else: - x = np.pad(x, [(0, 0)] + [(0, ts - cs) for ts, cs in shapes], 'constant') + for each x in X. If `target_shape[i] = -1`, it will be automatically + converted to X.shape[i], so that passing a target shape of e.g. (-1, n, m) + will leave the first dimension of each element untouched. + :return: a np.array of shape `(len(x), ) + target_shape`. 
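A minimal sketch of the intended behaviour, using toy ragged inputs (the `-1` keeps the feature dimension untouched):

import numpy as np
from spektral.utils import pad_jagged_array

x = [np.ones((2, 3)), np.ones((4, 3))]        # two ragged node-feature matrices
out = pad_jagged_array(x, (5, -1))            # pad the node dimension to 5
print(out.shape)                              # (2, 5, 3); missing entries are zeros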
+ """ + if len(x) < 1: + raise ValueError('Jagged array cannot be empty') + target_len = len(x) + target_shape = tuple(shp if shp != -1 else x[0].shape[j] + for j, shp in enumerate(target_shape)) + output = np.zeros((target_len,) + target_shape, dtype=x[0].dtype) + for i in range(target_len): + slc = (i,) + tuple(slice(shp) for shp in x[i].shape) + output[slc] = x[i] - dtype = x[0].dtype if len(x) > 0 else None - try: - return np.array(x, dtype=dtype) - except ValueError: - return np.array([_ for _ in x], dtype=dtype) + return output def add_eye(x): From 69b6cc4e6dd9ae1081bd4c8cfa480aa7c177d725 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 30 Oct 2020 17:52:52 +0100 Subject: [PATCH 05/57] Add PackedBatchLoader Change batch_generator to run indefinitely unless `epochs` is specified Update examples --- .../graph_prediction/ogbg-mol-hiv_disjoint.py | 10 +- examples/graph_prediction/qm9_batch.py | 7 +- examples/graph_prediction/qm9_disjoint.py | 19 ++-- examples/graph_prediction/tud_disjoint.py | 5 +- .../graph_signal_classification_mnist.py | 4 +- spektral/data/loaders.py | 101 ++++++++++-------- spektral/data/utils.py | 11 +- tests/data/test_loaders.py | 14 +++ 8 files changed, 98 insertions(+), 73 deletions(-) diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py index 656c9c28..e4527f1b 100644 --- a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py +++ b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py @@ -86,9 +86,9 @@ def train_step(X_, A_, E_, I_, y_): batches_in_epoch = np.ceil(len(A_tr) / batch_size) print('Fitting model') -batches_train = batch_generator([X_tr, A_tr, E_tr, y_tr], - batch_size=batch_size, epochs=epochs) -for b in batches_train: +batches_tr = batch_generator([X_tr, A_tr, E_tr, y_tr], + batch_size=batch_size, epochs=epochs) +for b in batches_tr: X_, A_, E_, I_ = to_disjoint(*b[:-1]) A_ = ops.sp_matrix_to_sp_tensor(A_) y_ = b[-1] @@ -107,8 +107,8 @@ def train_step(X_, A_, E_, I_, y_): print('Testing model') evaluator = Evaluator(name=dataset_name) y_pred = [] -batches_test = batch_generator([X_te, A_te, E_te], batch_size=batch_size) -for b in batches_test: +batches_te = batch_generator([X_te, A_te, E_te], batch_size=batch_size, epochs=1) +for b in batches_te: X_, A_, E_, I_ = to_disjoint(*b) A_ = ops.sp_matrix_to_sp_tensor(A_) p = model([X_, A_, E_, I_], training=False) diff --git a/examples/graph_prediction/qm9_batch.py b/examples/graph_prediction/qm9_batch.py index acbdd3fb..5c1e1564 100644 --- a/examples/graph_prediction/qm9_batch.py +++ b/examples/graph_prediction/qm9_batch.py @@ -4,16 +4,13 @@ """ import numpy as np -from sklearn.model_selection import train_test_split from tensorflow.keras.layers import Input, Dense from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam from spektral.data import BatchLoader -from spektral.datasets import qm9 from spektral.datasets.qm9 import QM9 from spektral.layers import EdgeConditionedConv, GlobalSumPool -from spektral.utils import label_to_one_hot ################################################################################ # PARAMETERS @@ -58,7 +55,7 @@ ################################################################################ # FIT MODEL ################################################################################ -loader_tr = BatchLoader(dataset_tr, batch_size=batch_size, epochs=epochs) +loader_tr = BatchLoader(dataset_tr, batch_size=batch_size) model.fit(loader_tr, 
steps_per_epoch=loader_tr.steps_per_epoch, epochs=epochs) @@ -68,5 +65,5 @@ ################################################################################ print('Testing model') loader_te = BatchLoader(dataset_te, batch_size=batch_size) -model_loss = model.evaluate(loader_te) +model_loss = model.evaluate(loader_te, steps=loader_tr.steps_per_epoch) print('Done. Test loss: {}'.format(model_loss)) diff --git a/examples/graph_prediction/qm9_disjoint.py b/examples/graph_prediction/qm9_disjoint.py index 13217fa0..0c7fa4f8 100644 --- a/examples/graph_prediction/qm9_disjoint.py +++ b/examples/graph_prediction/qm9_disjoint.py @@ -5,18 +5,14 @@ import numpy as np import tensorflow as tf -from sklearn.model_selection import train_test_split from tensorflow.keras.layers import Input, Dense from tensorflow.keras.losses import MeanSquaredError from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam from spektral.data import DisjointLoader -from spektral.datasets import qm9 from spektral.datasets.qm9 import QM9 -from spektral.layers import EdgeConditionedConv, ops, GlobalSumPool -from spektral.data.utils import to_disjoint, batch_generator -from spektral.utils import label_to_one_hot +from spektral.layers import EdgeConditionedConv, GlobalSumPool ################################################################################ # PARAMETERS @@ -59,6 +55,9 @@ loss_fn = MeanSquaredError() +################################################################################ +# FIT MODEL +################################################################################ @tf.function( input_signature=((tf.TensorSpec((None, F), dtype=tf.float64), tf.SparseTensorSpec((None, None), dtype=tf.int64), @@ -76,14 +75,10 @@ def train_step(inputs, target): return loss -################################################################################ -# FIT MODEL -################################################################################ +print('Fitting model') current_batch = 0 model_loss = 0 - -print('Fitting model') -loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs, shuffle=True) +loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) for batch in loader_tr: outs = train_step(*batch) @@ -99,7 +94,7 @@ def train_step(inputs, target): ################################################################################ print('Testing model') model_loss = 0 -loader_te = DisjointLoader(dataset_te, batch_size=batch_size) +loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) for batch in loader_te: inputs, target = batch predictions = model(inputs, training=False) diff --git a/examples/graph_prediction/tud_disjoint.py b/examples/graph_prediction/tud_disjoint.py index f7f557b1..e24d6304 100644 --- a/examples/graph_prediction/tud_disjoint.py +++ b/examples/graph_prediction/tud_disjoint.py @@ -101,7 +101,7 @@ def train_step(x_, a_, i_, y_): print('Fitting model') batches_train = batch_generator([x_train, a_train, y_train], - batch_size=batch_size, epochs=epochs) + batch_size=batch_size, epoch=epochs) for b in batches_train: x_, a_, i_ = to_disjoint(*b[:-1]) a_ = ops.sp_matrix_to_sp_tensor(a_) @@ -124,7 +124,8 @@ def train_step(x_, a_, i_, y_): print('Testing model') model_lss = model_acc = 0 batches_in_epoch = np.ceil(len(a_test) / batch_size) -batches_test = batch_generator([x_test, a_test, y_test], batch_size=batch_size) +batches_test = batch_generator([x_test, a_test, y_test], batch_size=batch_size, + epochs=1) for b in 
batches_test: x_, a_, i_ = to_disjoint(*b[:-1]) a_ = ops.sp_matrix_to_sp_tensor(a_) diff --git a/examples/other/graph_signal_classification_mnist.py b/examples/other/graph_signal_classification_mnist.py index 07346d03..a109c1fc 100644 --- a/examples/other/graph_signal_classification_mnist.py +++ b/examples/other/graph_signal_classification_mnist.py @@ -100,7 +100,7 @@ def evaluate(x, y): results_tr.append((l, a)) if curent_batch == batches_in_epoch: - batches_va = batch_generator([x_va, y_va], batch_size=batch_size) + batches_va = batch_generator([x_va, y_va], batch_size=batch_size, epochs=1) results_va = [evaluate(*batch) for batch in batches_va] results_va = np.array(results_va) loss_va, acc_va = results_va.mean(0) @@ -108,7 +108,7 @@ def evaluate(x, y): best_val_loss = loss_va current_patience = patience # Test - batches_te = batch_generator([x_te, y_te], batch_size=batch_size) + batches_te = batch_generator([x_te, y_te], batch_size=batch_size, epochs=1) results_te = [evaluate(*batch) for batch in batches_te] results_te = np.array(results_te) else: diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index f93ec258..61bf7a3b 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -13,14 +13,12 @@ class Loader: - def __init__(self, dataset, batch_size=1, epochs=1, shuffle=False): + def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): self.dataset = dataset self.batch_size = batch_size self.epochs = epochs self.shuffle = shuffle - self._generator = batch_generator( - self.dataset, batch_size=self.batch_size, epochs=self.epochs, - shuffle=self.shuffle) + self._generator = self.generator() self.steps_per_epoch = int(np.ceil(len(self.dataset) / self.batch_size)) def __iter__(self): @@ -30,76 +28,93 @@ def __next__(self): nxt = self._generator.__next__() return self.collate(nxt) - def tf(self): + def generator(self): + return batch_generator(self.dataset, batch_size=self.batch_size, + epochs=self.epochs, shuffle=self.shuffle) + + def collate(self, batch): raise NotImplementedError - def collate(self, data_list): + def tf(self): raise NotImplementedError - def _pack(self, data_list): - return [list(elem) for elem in zip(*[g.numpy() for g in data_list])] + def _pack(self, batch): + return [list(elem) for elem in zip(*[g.numpy() for g in batch])] -class BatchLoader(Loader): +class DisjointLoader(Loader): + def collate(self, batch): + packed = self._pack(batch) + y = np.array(packed[-1]) + ret = to_disjoint(*packed[:-1]) + ret = list(ret) + for i in range(len(ret)): + if sp.issparse(ret[i]): + ret[i] = sp_matrix_to_sp_tensor(ret[i]) + ret = tuple(ret) + + return ret, y + def tf(self): if not tf_loader_available: raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' 'or greater.') signature = copy.deepcopy(self.dataset.signature) - for k in signature: - signature[k]['shape'] = prepend_none(signature[k]['shape']) - if 'a' in signature: - # Adjacency matrix in batch mode is dense - signature['a']['spec'] = tf.TensorSpec - if 'e' in signature: + if 'y' in signature: # Edge attributes have an extra None dimension in batch mode - signature['e']['shape'] = prepend_none(signature['e']['shape']) + signature['y']['shape'] = prepend_none(signature['y']['shape']) + + if 'a' in signature: + # Adjacency matrix in batch mode is sparse + signature['a']['spec'] = tf.SparseTensorSpec + + signature['i'] = dict() + signature['i']['spec'] = tf.TensorSpec + signature['i']['shape'] = (None, ) + signature['i']['dtype'] = tf.as_dtype(tf.int64) return 
tf.data.Dataset.from_generator( lambda: (_ for _ in self), output_signature=output_signature(signature) ) - def collate(self, data_list): - data_packed = self._pack(data_list) - y = np.array(data_packed[-1]) - ret = to_batch(*data_packed[:-1]) - return ret, y +class BatchLoader(Loader): + def collate(self, batch): + packed = self._pack(batch) + y = np.array(packed[-1]) + ret = to_batch(*packed[:-1]) + return ret, y -class DisjointLoader(Loader): def tf(self): if not tf_loader_available: raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' 'or greater.') signature = copy.deepcopy(self.dataset.signature) - if 'y' in signature: - # Edge attributes have an extra None dimension in batch mode - signature['y']['shape'] = prepend_none(signature['y']['shape']) - + for k in signature: + signature[k]['shape'] = prepend_none(signature[k]['shape']) if 'a' in signature: - # Adjacency matrix in batch mode is sparse - signature['a']['spec'] = tf.SparseTensorSpec - - signature['i'] = dict() - signature['i']['spec'] = tf.TensorSpec - signature['i']['shape'] = (None, ) - signature['i']['dtype'] = tf.as_dtype(tf.int64) + # Adjacency matrix in batch mode is dense + signature['a']['spec'] = tf.TensorSpec + if 'e' in signature: + # Edge attributes have an extra None dimension in batch mode + signature['e']['shape'] = prepend_none(signature['e']['shape']) return tf.data.Dataset.from_generator( lambda: (_ for _ in self), output_signature=output_signature(signature) ) - def collate(self, data_list): - data_packed = self._pack(data_list) - y = np.array(data_packed[-1]) - ret = to_disjoint(*data_packed[:-1]) - ret = list(ret) - for i in range(len(ret)): - if sp.issparse(ret[i]): - ret[i] = sp_matrix_to_sp_tensor(ret[i]) - ret = tuple(ret) - return ret, y \ No newline at end of file +class PackedBatchLoader(BatchLoader): + def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): + super().__init__(dataset, batch_size=batch_size, epochs=epochs, shuffle=shuffle) + # Drop the Dataset container and work on packed tensors directly + self.dataset = self._pack(self.dataset) + self.dataset = to_batch(*self.dataset[:-1]) + (np.array(self.dataset[-1]), ) + # Re-instantiate generator after packing dataset + self._generator = self.generator() + + def collate(self, batch): + return batch[:-1], batch[-1] \ No newline at end of file diff --git a/spektral/data/utils.py b/spektral/data/utils.py index 735abd34..f2cab63c 100644 --- a/spektral/data/utils.py +++ b/spektral/data/utils.py @@ -118,7 +118,7 @@ def to_batch(x_list, a_list, e_list=None): return x_out, a_out -def batch_generator(data, batch_size=32, epochs=1, shuffle=True): +def batch_generator(data, batch_size=32, epochs=None, shuffle=True): """ Iterates over the data for the given number of epochs, yielding batches of size `batch_size`. @@ -128,22 +128,25 @@ def batch_generator(data, batch_size=32, epochs=1, shuffle=True): :param shuffle: whether to shuffle the data at the beginning of each epoch :return: batches of size `batch_size`. 
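Since `epochs` now defaults to `None` (iterate indefinitely), callers that want a finite loop must pass it explicitly; a minimal sketch:

import numpy as np
from spektral.data.utils import batch_generator

data = [np.arange(10), np.arange(10) * 2]     # two aligned arrays
for x_batch, y_batch in batch_generator(data, batch_size=4, epochs=2, shuffle=False):
    print(x_batch.shape, y_batch.shape)       # (4,) (4,) (2,) per epoch, for 2 epochs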
""" - if not isinstance(data, list): + if not isinstance(data, (list, tuple)): data = [data] if len(data) < 1: raise ValueError('data cannot be empty') if len(set([len(item) for item in data])) > 1: raise ValueError('All inputs must have the same __len__') + if epochs is None or epochs == -1: + epochs = np.inf len_data = len(data[0]) batches_per_epoch = int(np.ceil(len_data / batch_size)) - for epochs in range(epochs): + epoch = 0 + while epoch < epochs: + epoch += 1 if shuffle: shuffle_inplace(*data) for batch in range(batches_per_epoch): start = batch * batch_size stop = min(start + batch_size, len_data) - to_yield = [item[start:stop] for item in data] if len(data) == 1: to_yield = to_yield[0] diff --git a/tests/data/test_loaders.py b/tests/data/test_loaders.py index 493eb6c2..4a360f80 100644 --- a/tests/data/test_loaders.py +++ b/tests/data/test_loaders.py @@ -3,6 +3,7 @@ from spektral.data import DisjointLoader, BatchLoader from spektral.data.dataset import Dataset from spektral.data.graph import Graph +from spektral.data.loaders import PackedBatchLoader n_graphs = 10 ns = np.random.randint(3, 8, n_graphs) @@ -51,3 +52,16 @@ def test_batch(): assert a.shape == (graphs_in_batch, n, n) assert e.shape == (graphs_in_batch, n, n, s) assert y.shape == (graphs_in_batch, 2) + + +def test_fast_batch(): + data = TestDataset() + loader = PackedBatchLoader(data, batch_size=batch_size) + batches = [b for b in loader] + + (x, a, e), y = batches[-1] + n = max(ns) + assert x.shape == (graphs_in_batch, n, f) + assert a.shape == (graphs_in_batch, n, n) + assert e.shape == (graphs_in_batch, n, n, s) + assert y.shape == (graphs_in_batch, 2) From f9a3f64c416d2d48d616fe7c197cacbba953bb03 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Mon, 2 Nov 2020 15:25:19 +0100 Subject: [PATCH 06/57] Add download method to Dataset Implement TUD as Dataset Update example Add Loaders docs --- docs/autogen.py | 24 ++- examples/graph_prediction/tud_disjoint.py | 79 ++++----- spektral/data/dataset.py | 14 +- spektral/data/loaders.py | 23 +++ spektral/datasets/__init__.py | 2 +- spektral/datasets/qm9.py | 27 ++- spektral/datasets/tudataset.py | 205 ++++++++++++++++++++++ spektral/datasets/utils.py | 3 + tests/data/test_utils.py | 4 +- tests/test_datasets.py | 25 ++- 10 files changed, 326 insertions(+), 80 deletions(-) create mode 100644 spektral/datasets/tudataset.py create mode 100644 spektral/datasets/utils.py diff --git a/docs/autogen.py b/docs/autogen.py index a9d7c306..e3143e1c 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -100,11 +100,11 @@ }, { 'page': 'datasets.md', - 'functions': [ - datasets.tud.load_data - ], + 'functions': [], 'methods': [], - 'classes': [] + 'classes': [ + datasets.tudataset.TUDataset + ] }, { 'page': 'datasets.md', @@ -117,11 +117,11 @@ }, { 'page': 'datasets.md', - 'functions': [ - datasets.qm9.load_data - ], + 'functions': [], 'methods': [], - 'classes': [] + 'classes': [ + datasets.qm9.QM9 + ] }, { 'page': 'datasets.md', @@ -146,6 +146,14 @@ spektral.data.utils.to_disjoint, spektral.data.utils.to_batch, spektral.data.utils.batch_generator + ], + 'classes': [ + spektral.data.Graph, + spektral.data.Dataset, + spektral.data.Loader, + spektral.data.DisjointLoader, + spektral.data.BatchLoader, + spektral.data.PackedBatchLoader ] }, { diff --git a/examples/graph_prediction/tud_disjoint.py b/examples/graph_prediction/tud_disjoint.py index e24d6304..5e6f1357 100644 --- a/examples/graph_prediction/tud_disjoint.py +++ b/examples/graph_prediction/tud_disjoint.py @@ -6,16 +6,15 @@ import 
numpy as np import tensorflow as tf -from sklearn.model_selection import train_test_split from tensorflow.keras.layers import Dense, Dropout from tensorflow.keras.losses import CategoricalCrossentropy from tensorflow.keras.metrics import CategoricalAccuracy from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam -from spektral.datasets import tud -from spektral.layers import GINConv, GlobalAvgPool, ops -from spektral.data.utils import to_disjoint, batch_generator +from spektral.data import DisjointLoader +from spektral.datasets import tudataset +from spektral.layers import GINConv, GlobalAvgPool ################################################################################ # PARAMETERS @@ -29,16 +28,16 @@ ################################################################################ # LOAD DATA ################################################################################ -a, x, y = tud.load_data('PROTEINS', clean=True) +dataset = tudataset.TUDataset('PROTEINS', clean=True) # Parameters -F = x[0].shape[-1] # Dimension of node features -n_out = y.shape[-1] # Dimension of the target +F = dataset.F # Dimension of node features +n_out = dataset.n_out # Dimension of the target # Train/test split -a_train, a_test, \ -x_train, x_test, \ -y_train, y_test = train_test_split(a, x, y, test_size=0.1, random_state=0) +idxs = np.random.permutation(len(dataset)) +split = int(0.9 * len(dataset)) +dataset_tr, dataset_te = dataset[:split], dataset[split:] ################################################################################ @@ -75,45 +74,39 @@ def call(self, inputs, **kwargs): acc_fn = CategoricalAccuracy() +################################################################################ +# FIT MODEL +################################################################################ @tf.function( - input_signature=(tf.TensorSpec((None, F), dtype=tf.float64), - tf.SparseTensorSpec((None, None), dtype=tf.int64), - tf.TensorSpec((None,), dtype=tf.int32), + input_signature=((tf.TensorSpec((None, F), dtype=tf.float64), + tf.SparseTensorSpec((None, None), dtype=tf.int64), + tf.TensorSpec((None,), dtype=tf.int64)), tf.TensorSpec((None, n_out), dtype=tf.float64)), experimental_relax_shapes=True) -def train_step(x_, a_, i_, y_): +def train_step(inputs, target): with tf.GradientTape() as tape: - predictions = model([x_, a_, i_], training=True) - loss = loss_fn(y_, predictions) + predictions = model(inputs, training=True) + loss = loss_fn(target, predictions) loss += sum(model.losses) gradients = tape.gradient(loss, model.trainable_variables) opt.apply_gradients(zip(gradients, model.trainable_variables)) - acc = acc_fn(y_, predictions) + acc = acc_fn(target, predictions) return loss, acc -################################################################################ -# FIT MODEL -################################################################################ +print('Fitting model') current_batch = 0 model_lss = model_acc = 0 -batches_in_epoch = np.ceil(len(a_train) / batch_size) - -print('Fitting model') -batches_train = batch_generator([x_train, a_train, y_train], - batch_size=batch_size, epoch=epochs) -for b in batches_train: - x_, a_, i_ = to_disjoint(*b[:-1]) - a_ = ops.sp_matrix_to_sp_tensor(a_) - y_ = b[-1] - lss, acc = train_step(x_, a_, i_, y_) +loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) +for batch in loader_tr: + lss, acc = train_step(*batch) model_lss += lss.numpy() model_acc += acc.numpy() current_batch += 1 - if 
current_batch == batches_in_epoch: - model_lss /= batches_in_epoch - model_acc /= batches_in_epoch + if current_batch == loader_tr.steps_per_epoch: + model_lss /= loader_tr.steps_per_epoch + model_acc /= loader_tr.steps_per_epoch print('Loss: {}. Acc: {}'.format(model_lss, model_acc)) model_lss = model_acc = 0 current_batch = 0 @@ -123,16 +116,12 @@ def train_step(x_, a_, i_, y_): ################################################################################ print('Testing model') model_lss = model_acc = 0 -batches_in_epoch = np.ceil(len(a_test) / batch_size) -batches_test = batch_generator([x_test, a_test, y_test], batch_size=batch_size, - epochs=1) -for b in batches_test: - x_, a_, i_ = to_disjoint(*b[:-1]) - a_ = ops.sp_matrix_to_sp_tensor(a_) - y_ = b[-1] - predictions = model([x_, a_, i_], training=False) - model_lss += loss_fn(y_, predictions) - model_acc += acc_fn(y_, predictions) -model_lss /= batches_in_epoch -model_acc /= batches_in_epoch +loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) +for batch in loader_te: + inputs, target = batch + predictions = model(inputs, training=False) + model_lss += loss_fn(target, predictions) + model_acc += acc_fn(target, predictions) +model_lss /= loader_te.steps_per_epoch +model_acc /= loader_te.steps_per_epoch print('Done. Test loss: {}. Test acc: {}'.format(model_lss, model_acc)) diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 7084ad0b..7639df2c 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -1,11 +1,12 @@ import copy +import os.path as osp +import numpy as np import tensorflow as tf from spektral.data import Graph from spektral.data.utils import get_spec - -import numpy as np +from spektral.datasets.utils import DATASET_FOLDER class Dataset: @@ -44,6 +45,8 @@ def read(self): implementing a custom Loader for your dataset. """ def __init__(self, **kwargs): + if not osp.exists(self.path): + self.download() self.graphs = self.read() # Make sure that we always have at least one graph if len(self.graphs) == 0: @@ -57,9 +60,16 @@ def __init__(self, **kwargs): for k, v in kwargs.items(): setattr(self, k, v) + @property + def path(self): + return osp.join(DATASET_FOLDER, self.__class__.__name__) + def read(self): raise NotImplementedError + def download(self): + raise NotImplementedError + def _signature(self): signature = {} graph = self.graphs[0] # This is always non-empty diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index 61bf7a3b..a7faf4e2 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -13,6 +13,15 @@ class Loader: + """ + **Arguments** + + - `dataset`: a Dataset object to load. + - `batch_size`: size of the mini-batches. + - `epochs`: number of epochs to iterate over the datset. By default (`None`) + iterates indefinitely. + - `shuffle`: whether to shuffle the data at the start of each epoch. + """ def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): self.dataset = dataset self.batch_size = batch_size @@ -43,6 +52,9 @@ def _pack(self, batch): class DisjointLoader(Loader): + """ + A [Loader](https://graphneural.network/) for disjoint mode. + """ def collate(self, batch): packed = self._pack(batch) y = np.array(packed[-1]) @@ -80,6 +92,9 @@ def tf(self): class BatchLoader(Loader): + """ + A [Loader](https://graphneural.network/) for batch mode. 
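A minimal end-to-end sketch of the loaders documented here, along the lines of tests/data/test_loaders.py; `RandomGraphs` is a throwaway in-memory dataset made up for illustration:

import numpy as np
import scipy.sparse as sp

from spektral.data import Dataset, Graph, DisjointLoader, BatchLoader


class RandomGraphs(Dataset):
    """Ten random graphs with 3-7 nodes, dense edge features, and a 2-class label."""
    def download(self):
        pass  # nothing to fetch for an in-memory dataset

    def read(self):
        graphs = []
        for n in np.random.randint(3, 8, 10):
            x = np.random.rand(n, 4)                               # (N, F)
            a = sp.coo_matrix(np.random.binomial(1, 0.3, (n, n)))  # (N, N)
            e = np.random.rand(n, n, 3)                            # dense edge features (N, N, S)
            y = np.eye(2)[np.random.randint(0, 2)]                 # one-hot label
            graphs.append(Graph(x=x, adj=a, edge_attr=e, y=y))
        return graphs


data = RandomGraphs()
for inputs, y in DisjointLoader(data, batch_size=4, epochs=1, shuffle=False):
    x, a, e, i = inputs   # disjoint union: a is a SparseTensor, i maps nodes to graphs
for inputs, y in BatchLoader(data, batch_size=4, epochs=1, shuffle=False):
    x, a, e = inputs      # everything zero-padded to the largest graph in the batch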
+ """ def collate(self, batch): packed = self._pack(batch) y = np.array(packed[-1]) @@ -108,6 +123,14 @@ def tf(self): class PackedBatchLoader(BatchLoader): + """ + A [Loader](https://graphneural.network/) for batch mode, that pre-pads all + graphs to have the same number of nodes. + While using more memory than `BatchLoader`, this loader should reduce the + overhead due to padding each batch independently. + Use this loader if you have graphs of similar sizes and no outliers (i.e., + anomalous graphs with many more nodes than average). + """ def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): super().__init__(dataset, batch_size=batch_size, epochs=epochs, shuffle=shuffle) # Drop the Dataset container and work on packed tensors directly diff --git a/spektral/datasets/__init__.py b/spektral/datasets/__init__.py index bb7004b4..be4c234b 100644 --- a/spektral/datasets/__init__.py +++ b/spektral/datasets/__init__.py @@ -4,4 +4,4 @@ from . import mnist from . import ogb from . import qm9 -from . import tud +from . import tudataset diff --git a/spektral/datasets/qm9.py b/spektral/datasets/qm9.py index 20d0ded8..6a7a2aa0 100644 --- a/spektral/datasets/qm9.py +++ b/spektral/datasets/qm9.py @@ -1,4 +1,5 @@ import os +import os.path as osp import numpy as np import scipy.sparse as sp @@ -8,9 +9,6 @@ from spektral.utils import label_to_one_hot from spektral.utils.io import load_csv, load_sdf -DATA_PATH = os.path.expanduser('~/.spektral/datasets/qm9/') -DATASET_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/gdb9.tar.gz' - ATOM_TYPES = [1, 6, 7, 8, 9] BOND_TYPES = [1, 2, 3, 4] @@ -42,17 +40,20 @@ class QM9(Dataset): - `amount`: int, load this many molecules instead of the full dataset (useful for debugging). """ + url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/gdb9.tar.gz' + def __init__(self, amount=None, **kwargs): self.amount = amount super().__init__(**kwargs) - def read(self): - if not os.path.exists(DATA_PATH): - _download_data() # Try to download dataset + def download(self): + get_file('qm9.tar.gz', self.url, extract=True, cache_dir=self.path, + cache_subdir=self.path) + os.remove(osp.join(self.path, 'qm9.tar.gz')) - # Load molecular graphs + def read(self): print('Loading QM9 dataset.') - sdf_file = os.path.join(DATA_PATH, 'qm9.sdf') + sdf_file = osp.join(self.path, 'gdb9.sdf') data = load_sdf(sdf_file, amount=self.amount) # Internal SDF format x_list, a_list, e_list = [], [], [] @@ -64,7 +65,7 @@ def read(self): e_list += [e] # Load labels - labels_file = os.path.join(DATA_PATH, 'qm9.sdf.csv') + labels_file = osp.join(self.path, 'gdb9.sdf.csv') labels = load_csv(labels_file) labels = labels.set_index('mol_id').values[:, 1:] if self.amount is not None: @@ -95,11 +96,3 @@ def mol_to_adj(mol): edge_attr = np.array([label_to_one_hot(e, BOND_TYPES) for e in edge_attr]) return a, edge_attr - - -def _download_data(): - _ = get_file('qm9.tar.gz', DATASET_URL, extract=True, cache_dir=DATA_PATH, - cache_subdir=DATA_PATH) - os.rename(DATA_PATH + 'gdb9.sdf', DATA_PATH + 'qm9.sdf') - os.rename(DATA_PATH + 'gdb9.sdf.csv', DATA_PATH + 'qm9.sdf.csv') - os.remove(DATA_PATH + 'qm9.tar.gz') diff --git a/spektral/datasets/tudataset.py b/spektral/datasets/tudataset.py new file mode 100644 index 00000000..76a2d546 --- /dev/null +++ b/spektral/datasets/tudataset.py @@ -0,0 +1,205 @@ +import glob +import os +import shutil +import zipfile +from os import path as osp +from urllib.error import URLError + +import numpy as np +import pandas as pd +import 
requests +import scipy.sparse as sp +from sklearn.preprocessing import OneHotEncoder, StandardScaler + +from spektral.data import Dataset, Graph +from spektral.utils import io + +DATA_PATH = osp.expanduser('~/.spektral/datasets/') + + +class TUDataset(Dataset): + """ + The Benchmark Data Sets for Graph Kernels from TU Dortmund + ([link](https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets)). + + Node features are computed by concatenating the following features for + each node: + + - node attributes, if available; + - node labels, if available, one-hot encoded. + + Some datasets might not have node features at all. In this case, attempting + to use the dataset with a Loader will result in a crash. In this case, + you should set the features manually by iterating over the `graph` list. + + Edge features are computed by concatenating the following features for + each node: + + - edge attributes, if available; + - edge labels, if available, one-hot encoded. + + Graph labels are provided for each dataset. See the dataset's README in + ~/.spektral/datasets/TUD/`name`/ for details about each dataset. + + **Arguments** + + - `name`: str, name of the dataset to load (see `TUD.available_datasets()`). + - `clean`: if `True`, rload a version of the dataset with no isomorphic + graphs. + """ + url = 'https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets' + url_clean = 'https://raw.githubusercontent.com/nd7141/graph_datasets/master/datasets' + + def __init__(self, name, clean=False, **kwargs): + self.name = name + self.clean = clean + super().__init__(**kwargs) + + @property + def path(self): + return osp.join(super(TUDataset, self).path, + self.name + ('_clean' if self.clean else '')) + + def download(self): + print('Downloading {} dataset{}.' + .format(self.name, ' (clean)' if self.clean else '')) + + url = '{}/{}.zip'.format(self.url_clean if self.clean else self.url, self.name) + + req = requests.get(url) + if req.status_code == 404: + raise ValueError('Unknown dataset {}. See TUD.available_datasets()' + ' for a list of available datasets.' 
+ .format(self.name)) + + os.makedirs(self.path, exist_ok=True) + ofname = osp.join(self.path, '{}.zip'.format(self.name)) + with open(ofname, 'wb') as of: + of.write(req.content) + with zipfile.ZipFile(ofname, 'r') as of: + of.extractall(self.path) + os.remove(ofname) + + # TUD datasets are zipped in a folder: unpack them + parent = self.path + subfolder = osp.join(self.path, self.name) + for filename in os.listdir(subfolder): + shutil.move(osp.join(subfolder, filename), osp.join(parent, filename)) + os.rmdir(subfolder) + + def read(self): + fname_template = osp.join(self.path, '{}_{{}}.txt'.format(self.name)) + available = [ + f.split(os.sep)[-1][len(self.name) + 1:-4] # Remove leading name + for f in glob.glob(fname_template.format('*')) + ] + + # Batch index + node_batch_index = io.load_txt(fname_template.format('graph_indicator'))\ + .astype(int) - 1 + n_nodes = np.bincount(node_batch_index) + n_nodes_cum = np.concatenate(([0], np.cumsum(n_nodes)[:-1])) + + # Adjacency matrix + edges = io.load_txt(fname_template.format('A'), delimiter=',').astype(int) - 1 + # Remove duplicates and self-loops from edges + _, mask = np.unique(edges, axis=0, return_index=True) + mask = mask[edges[mask, 0] != edges[mask, 1]] + edges = edges[mask] + # Split edges into separate edge lists + edge_batch_idx = node_batch_index[edges[:, 0]] + n_edges = np.bincount(edge_batch_idx) + n_edges_cum = np.cumsum(n_edges[:-1]) + edge_lists = np.split(edges - n_nodes_cum[edge_batch_idx, None], n_edges_cum) + # Create sparse adjacency matrices + a_list = [ + sp.coo_matrix( + (np.ones_like(el[:, 0]), (el[:, 0], el[:, 1])), + shape=(n_nodes[i], n_nodes[i]) + ) + for i, el in enumerate(edge_lists) + ] + + # Node features + x_list = [] + if 'node_attributes' in available: + x_attr = io.load_txt(fname_template.format('node_attributes'), delimiter=',') + if x_attr.ndim == 1: + x_attr = x_attr[:, None] + x_list.append(x_attr) + if 'node_labels' in available: + x_labs = io.load_txt(fname_template.format('node_labels')) + if x_labs.ndim == 1: + x_labs = x_labs[:, None] + x_labs = np.concatenate([ + _normalize(xl_[:, None], 'ohe') for xl_ in x_labs.T + ], -1) + x_list.append(x_labs) + if len(x_list) > 0: + x_list = np.concatenate(x_list, -1) + x_list = np.split(x_list, n_nodes_cum[1:]) + else: + print('WARNING: this dataset doesn\'t have node attributes.' 
+ 'Consider creating manual features before using it with a ' + 'Loader.') + x_list = [None] * len(n_nodes) + + # Edge features + e_list = [] + if 'edge_attributes' in available: + e_attr = io.load_txt(fname_template.format('edge_attributes')) + if e_attr.ndim == 1: + e_attr = e_attr[:, None] + e_attr = e_attr[mask] + e_list.append(e_attr) + if 'edge_labels' in available: + e_labs = io.load_txt(fname_template.format('edge_labels')) + if e_labs.ndim == 1: + e_labs = e_labs[:, None] + e_labs = e_labs[mask] + e_labs = np.concatenate([ + _normalize(el_[:, None], 'ohe') for el_ in e_labs.T + ], -1) + e_list.append(e_labs) + if len(e_list) > 0: + e_list = np.concatenate(e_list, -1) + e_list = np.split(e_list, n_edges_cum) + else: + e_list = [None] * len(n_nodes) + + # Labels + if 'graph_attributes' in available: + labels = io.load_txt(fname_template.format('graph_attributes')) + elif 'graph_labels' in available: + labels = io.load_txt(fname_template.format('graph_labels')) + labels = _normalize(labels[:, None], 'ohe') + else: + raise ValueError('No labels available for dataset {}' + .format(self.name)) + + # Convert to Graph + print('Successfully loaded {}.'.format(self.name)) + return [Graph(x=x, adj=a, edge_attr=e, y=y) + for x, a, e, y in zip(x_list, a_list, e_list, labels)] + + def available_datasets(self): + try: + names = pd.read_html(self.url)[0].Name[2:-1].values.tolist() + return [d[:-4] for d in names] + except URLError: + # No internet, don't panic + print('No connection. See {}'.format(self.url)) + return [] + + +def _normalize(x, norm=None): + """ + Apply one-hot encoding or z-score to a list of node features + """ + if norm == 'ohe': + fnorm = OneHotEncoder(sparse=False, categories='auto') + elif norm == 'zscore': + fnorm = StandardScaler() + else: + return x + return fnorm.fit_transform(x) diff --git a/spektral/datasets/utils.py b/spektral/datasets/utils.py new file mode 100644 index 00000000..a1335090 --- /dev/null +++ b/spektral/datasets/utils.py @@ -0,0 +1,3 @@ +import os.path as osp + +DATASET_FOLDER = osp.expanduser('~/.spektral/datasets') \ No newline at end of file diff --git a/tests/data/test_utils.py b/tests/data/test_utils.py index 48f07783..2be165f4 100644 --- a/tests/data/test_utils.py +++ b/tests/data/test_utils.py @@ -2,9 +2,9 @@ from spektral.data import Dataset, Graph from spektral.data.utils import to_disjoint, to_batch, batch_generator -from spektral.datasets import tud +from spektral.datasets import tudataset -a_list, x_list, y = tud.load_data('ENZYMES', clean=True) +a_list, x_list, y = tudataset.load_data('ENZYMES', clean=True) def test_to_batch(): diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3aa3efc3..10b98910 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,6 +1,8 @@ -from spektral.datasets import delaunay, qm9, citation, graphsage, mnist, tud +from spektral.datasets import delaunay, qm9, citation, graphsage, mnist, tudataset from spektral.data import DisjointLoader, BatchLoader +batch_size = 3 + def correctly_padded(adj, nf, ef): assert adj.ndim == 3 @@ -41,13 +43,26 @@ def test_mnist(): def test_qm9(): dataset = qm9.QM9(amount=100) - dl = DisjointLoader(dataset, batch_size=3) + dl = DisjointLoader(dataset, batch_size=batch_size) dl.__next__() - bl = BatchLoader(dataset, batch_size=3) + bl = BatchLoader(dataset, batch_size=batch_size) bl.__next__() def test_tud(): - tud.load_data('PROTEINS', clean=False) - tud.load_data('ENZYMES', clean=True) + # Edge labels + edge attributes + dataset = 
tudataset.TUDataset('BZR_MD', clean=False) + dl = DisjointLoader(dataset, batch_size=batch_size) + dl.__next__() + + bl = BatchLoader(dataset, batch_size=batch_size) + bl.__next__() + + # Node labels + node attributes + clean version + dataset = tudataset.TUDataset('ENZYMES', clean=True) + dl = DisjointLoader(dataset, batch_size=batch_size) + dl.__next__() + + bl = BatchLoader(dataset, batch_size=batch_size) + bl.__next__() From 75ca4846da09e62f409eb59fc313f3563567d211 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Mon, 2 Nov 2020 17:35:13 +0100 Subject: [PATCH 07/57] Implement OGB Dataset wrapper Update examples --- .../graph_prediction/ogbg-mol-esol_batch.py | 60 ++++--- .../graph_prediction/ogbg-mol-hiv_disjoint.py | 84 +++++---- spektral/data/__init__.py | 2 +- spektral/data/dataset.py | 2 +- spektral/datasets/ogb.py | 83 +++------ spektral/datasets/tud.py | 170 ------------------ 6 files changed, 103 insertions(+), 298 deletions(-) delete mode 100644 spektral/datasets/tud.py diff --git a/examples/graph_prediction/ogbg-mol-esol_batch.py b/examples/graph_prediction/ogbg-mol-esol_batch.py index a03bcbbd..489be31f 100644 --- a/examples/graph_prediction/ogbg-mol-esol_batch.py +++ b/examples/graph_prediction/ogbg-mol-esol_batch.py @@ -1,52 +1,53 @@ """ This example shows how to perform molecule regression with the [Open Graph Benchmark](https://ogb.stanford.edu) `mol-esol` dataset, using a -simple GIN-based GNN with MinCutPool in batch mode. +simple GCN with MinCutPool in batch mode. Expect unstable training due to the small-ish size of the dataset. """ +import numpy as np from ogb.graphproppred import GraphPropPredDataset, Evaluator from tensorflow.keras.callbacks import EarlyStopping from tensorflow.keras.layers import Input, Dense from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam -from spektral.datasets import ogb +from spektral.data import BatchLoader +from spektral.datasets.ogb import OGB from spektral.layers import GraphConv, MinCutPool, GlobalSumPool -from spektral.utils import pad_jagged_array ################################################################################ # PARAMETERS ################################################################################ learning_rate = 1e-3 # Learning rate -epochs = 99999 # Number of training epochs +epochs = 10 # Number of training epochs batch_size = 32 # Batch size ################################################################################ # LOAD DATA ################################################################################ dataset_name = 'ogbg-molesol' -dataset = GraphPropPredDataset(name=dataset_name) -n_out = dataset.num_tasks -N = max(g[0]['num_nodes'] for g in dataset) +ogb_dataset = GraphPropPredDataset(name=dataset_name) +dataset = OGB(ogb_dataset) -idx = dataset.get_idx_split() -tr_idx, va_idx, te_idx = idx["train"], idx["valid"], idx["test"] +# Parameters +N = max(g.N for g in dataset) +F = dataset.F # Dimension of node features +S = dataset.S # Dimension of edge features +n_out = dataset.n_out # Dimension of the target -X, A, _, y = ogb.dataset_to_numpy(dataset, dtype='f8') -A = [a.toarray() for a in A] -F = X[0].shape[-1] -X = pad_jagged_array(X, (N, F)) -A = pad_jagged_array(A, (N, N)) -X_tr, A_tr, y_tr = X[tr_idx], A[tr_idx], y[tr_idx] -X_va, A_va, y_va = X[va_idx], A[va_idx], y[va_idx] -X_te, A_te, y_te = X[te_idx], A[te_idx], y[te_idx] +# Train/test split +idx = ogb_dataset.get_idx_split() +tr_idx, va_idx, te_idx = idx["train"], idx["valid"], idx["test"] 
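The split on the next lines works because this patch also extends `Dataset.__getitem__` to accept `np.ndarray` keys (see the `spektral/data/dataset.py` hunk further below); a minimal sketch of the same idea, reusing the `dataset` defined just above:

import numpy as np

subset = dataset[np.array([0, 2, 5])]   # the selected graphs, ready to be fed to a Loader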
+dataset_tr = dataset[tr_idx] +dataset_va = dataset[va_idx] +dataset_te = dataset[te_idx] ################################################################################ # BUILD MODEL ################################################################################ -X_in = Input(shape=(N, F)) -A_in = Input(shape=(N, N)) +X_in = Input(shape=(None, F)) +A_in = Input(shape=(None, None)) X_1 = GraphConv(32, activation='relu')([X_in, A_in]) X_1, A_1 = MinCutPool(N // 2)([X_1, A_in]) @@ -63,19 +64,22 @@ ################################################################################ # FIT MODEL ################################################################################ -model.fit([X_tr, A_tr], - y_tr, - batch_size=batch_size, - validation_data=([X_va, A_va], y_va), - callbacks=[EarlyStopping(patience=200, restore_best_weights=True)], - epochs=epochs) +loader_tr = BatchLoader(dataset_tr, batch_size=batch_size) +loader_va = BatchLoader(dataset_va, batch_size=batch_size) +model.fit(loader_tr, + steps_per_epoch=loader_tr.steps_per_epoch, + epochs=epochs, + validation_data=loader_va, + validation_steps=loader_va.steps_per_epoch, + callbacks=[EarlyStopping(patience=10, restore_best_weights=True)]) ################################################################################ # EVALUATE MODEL ################################################################################ print('Testing model') evaluator = Evaluator(name=dataset_name) -y_pred = model.predict([X_te, A_te], batch_size=batch_size) -ogb_score = evaluator.eval({'y_true': y_te, 'y_pred': y_pred}) - +loader_te = BatchLoader(dataset_te, batch_size=batch_size, epochs=1) +y_pred = model.predict(loader_te, batch_size=batch_size) +y_true = np.vstack([g.y for g in dataset_te]) +ogb_score = evaluator.eval({'y_true': y_true, 'y_pred': y_pred}) print('Done. 
RMSE: {:.4f}'.format(ogb_score['rmse'])) diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py index e4527f1b..44cc1618 100644 --- a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py +++ b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py @@ -14,9 +14,9 @@ from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam -from spektral.datasets import ogb -from spektral.layers import EdgeConditionedConv, ops, GlobalSumPool -from spektral.data.utils import to_disjoint, batch_generator +from spektral.data import DisjointLoader +from spektral.datasets.ogb import OGB +from spektral.layers import EdgeConditionedConv, GlobalSumPool ################################################################################ # PARAMETERS @@ -29,18 +29,20 @@ # LOAD DATA ################################################################################ dataset_name = 'ogbg-molhiv' -dataset = GraphPropPredDataset(name=dataset_name) -n_out = dataset.num_tasks +ogb_dataset = GraphPropPredDataset(name=dataset_name) +dataset = OGB(ogb_dataset) -idx = dataset.get_idx_split() -tr_idx, va_idx, te_idx = idx["train"], idx["valid"], idx["test"] - -X_tr, A_tr, E_tr, y_tr = ogb.dataset_to_numpy(dataset, tr_idx, dtype='f8') -X_va, A_va, E_va, y_va = ogb.dataset_to_numpy(dataset, va_idx, dtype='f8') -X_te, A_te, E_te, y_te = ogb.dataset_to_numpy(dataset, te_idx, dtype='f8') +# Parameters +F = dataset.F # Dimension of node features +S = dataset.S # Dimension of edge features +n_out = dataset.n_out # Dimension of the target -F = X_tr[0].shape[-1] -S = E_tr[0].shape[-1] +# Train/test split +idx = ogb_dataset.get_idx_split() +tr_idx, va_idx, te_idx = idx["train"], idx["valid"], idx["test"] +dataset_tr = dataset[tr_idx] +dataset_va = dataset[va_idx] +dataset_te = dataset[te_idx] ################################################################################ # BUILD MODEL @@ -61,43 +63,37 @@ loss_fn = BinaryCrossentropy() +################################################################################ +# FIT MODEL +################################################################################ @tf.function( - input_signature=(tf.TensorSpec((None, F), dtype=tf.float64), - tf.SparseTensorSpec((None, None), dtype=tf.float64), - tf.TensorSpec((None, S), dtype=tf.float64), - tf.TensorSpec((None,), dtype=tf.int32), + input_signature=((tf.TensorSpec((None, F), dtype=tf.float64), + tf.SparseTensorSpec((None, None), dtype=tf.int64), + tf.TensorSpec((None, S), dtype=tf.float64), + tf.TensorSpec((None,), dtype=tf.int64)), tf.TensorSpec((None, n_out), dtype=tf.float64)), experimental_relax_shapes=True) -def train_step(X_, A_, E_, I_, y_): +def train_step(inputs, target): with tf.GradientTape() as tape: - predictions = model([X_, A_, E_, I_], training=True) - loss = loss_fn(y_, predictions) + predictions = model(inputs, training=True) + loss = loss_fn(target, predictions) loss += sum(model.losses) gradients = tape.gradient(loss, model.trainable_variables) opt.apply_gradients(zip(gradients, model.trainable_variables)) return loss -################################################################################ -# FIT MODEL -################################################################################ +print('Fitting model') current_batch = 0 model_loss = 0 -batches_in_epoch = np.ceil(len(A_tr) / batch_size) +loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) +for batch in loader_tr: + outs = train_step(*batch) -print('Fitting 
model') -batches_tr = batch_generator([X_tr, A_tr, E_tr, y_tr], - batch_size=batch_size, epochs=epochs) -for b in batches_tr: - X_, A_, E_, I_ = to_disjoint(*b[:-1]) - A_ = ops.sp_matrix_to_sp_tensor(A_) - y_ = b[-1] - outs = train_step(X_, A_, E_, I_, y_) - - model_loss += outs.numpy() + model_loss += outs current_batch += 1 - if current_batch == batches_in_epoch: - print('Loss: {}'.format(model_loss / batches_in_epoch)) + if current_batch == loader_tr.steps_per_epoch: + print('Loss: {}'.format(model_loss / loader_tr.steps_per_epoch)) model_loss = 0 current_batch = 0 @@ -106,17 +102,19 @@ def train_step(X_, A_, E_, I_, y_): ################################################################################ print('Testing model') evaluator = Evaluator(name=dataset_name) +y_true = [] y_pred = [] -batches_te = batch_generator([X_te, A_te, E_te], batch_size=batch_size, epochs=1) -for b in batches_te: - X_, A_, E_, I_ = to_disjoint(*b) - A_ = ops.sp_matrix_to_sp_tensor(A_) - p = model([X_, A_, E_, I_], training=False) +loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) +for batch in loader_te: + inputs, target = batch + p = model(inputs, training=False) + y_true.append(target) y_pred.append(p.numpy()) +y_true = np.vstack(y_true) y_pred = np.vstack(y_pred) -model_loss = loss_fn(y_te, y_pred) -ogb_score = evaluator.eval({'y_true': y_te, 'y_pred': y_pred}) +model_loss = loss_fn(y_true, y_pred) +ogb_score = evaluator.eval({'y_true': y_true, 'y_pred': y_pred}) print('Done. Test loss: {:.4f}. ROC-AUC: {:.2f}' .format(model_loss, ogb_score['rocauc'])) diff --git a/spektral/data/__init__.py b/spektral/data/__init__.py index 7b52a983..c11741c3 100644 --- a/spektral/data/__init__.py +++ b/spektral/data/__init__.py @@ -1,3 +1,3 @@ from .graph import Graph from .dataset import Dataset -from .loaders import Loader, BatchLoader, DisjointLoader +from .loaders import Loader, BatchLoader, DisjointLoader, PackedBatchLoader diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 7639df2c..ee6ce913 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -101,7 +101,7 @@ def _signature(self): def __getitem__(self, key): if not (np.issubdtype(type(key), np.integer) or - isinstance(key, (slice, list, tuple))): + isinstance(key, (slice, list, tuple, np.ndarray))): raise ValueError('Unsupported key type: {}'.format(type(key))) if np.issubdtype(type(key), np.integer): return self.graphs[int(key)] diff --git a/spektral/datasets/ogb.py b/spektral/datasets/ogb.py index 55bad607..fa6749ae 100644 --- a/spektral/datasets/ogb.py +++ b/spektral/datasets/ogb.py @@ -1,63 +1,36 @@ -import scipy.sparse as sp import numpy as np +import scipy.sparse as sp +from spektral.data import Dataset, Graph -def graph_to_numpy(graph, dtype=None): - """ - Converts a graph in OGB's library-agnostic format to a representation in - Numpy/Scipy. See the [Open Graph Benchmark's website](https://ogb.stanford.edu) - for more information. - :param graph: OGB library-agnostic graph; - :param dtype: if set, all output arrays will be cast to this dtype. - :return: - - X: np.array of shape (N, F) with the node features; - - A: scipy.sparse adjacency matrix of shape (N, N) in COOrdinate format; - - E: if edge features are available, np.array of shape (n_edges, S), - `None` otherwise. 
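A minimal sketch of how the wrapper below is meant to be used, condensed from the updated OGB examples in this patch:

from ogb.graphproppred import GraphPropPredDataset

from spektral.data import DisjointLoader
from spektral.datasets.ogb import OGB

ogb_dataset = GraphPropPredDataset(name='ogbg-molhiv')   # download is handled by OGB
dataset = OGB(ogb_dataset)

idx = ogb_dataset.get_idx_split()
dataset_tr, dataset_te = dataset[idx['train']], dataset[idx['test']]
loader_tr = DisjointLoader(dataset_tr, batch_size=32, epochs=10)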
+ +class OGB(Dataset): """ - N = graph['num_nodes'] - X = graph['node_feat'] - if X is not None: - X = X.astype(dtype) - row, col = graph['edge_index'] - A = sp.coo_matrix((np.ones_like(row), (row, col)), shape=(N, N)).astype(dtype) - E = graph['edge_feat'] - if E is not None: - E = E.astype(dtype) + Wrapper for OGB datasets. - return X, A, E + **Arguments** + - `dataset`: an OGB library-agnostic Graph*Dataset object. -def dataset_to_numpy(dataset, indices=None, dtype=None): - """ - Converts a dataset in OGB's library-agnostic version to lists of Numpy/Scipy - arrays. See the [Open Graph Benchmark's website](https://ogb.stanford.edu) - for more information. - :param dataset: OGB library-agnostic dataset (e.g., GraphPropPredDataset); - :param indices: optional, a list of integer indices; if provided, only these - graphs will be converted; - :param dtype: if set, the arrays in the returned lists will have this dtype. - :return: - - X_list: list of np.arrays of (variable) shape (N, F) with node features; - - A_list: list of scipy.sparse adjacency matrices of (variable) shape - (N, N); - - E_list: list of np.arrays of (variable) shape (n_nodes, S) with edge - attributes. If edge attributes are not available, a list of None. - - y_list: np.array of shape (n_graphs, n_tasks) with the task labels; """ - X_list = [] - A_list = [] - E_list = [] - y_list = [] - if indices is None: - indices = range(len(dataset)) - - for i in indices: - graph, label = dataset[int(i)] - X, A, E = graph_to_numpy(graph, dtype=dtype) - X_list.append(X) - A_list.append(A) - E_list.append(E) - y_list.append(label) - - return X_list, A_list, E_list, np.array(y_list) + def __init__(self, dataset, **kwargs): + self.dataset = dataset + super().__init__(**kwargs) + + def read(self): + return [Graph(*_elem_to_numpy(elem)) for elem in self.dataset] + + def download(self): + # Download is handled by OGB + pass + + +def _elem_to_numpy(elem): + graph, label = elem + n = graph['num_nodes'] + x = graph['node_feat'] + row, col = graph['edge_index'] + a = sp.coo_matrix((np.ones_like(row), (row, col)), shape=(n, n)) + e = graph['edge_feat'] + + return x, a, e, label diff --git a/spektral/datasets/tud.py b/spektral/datasets/tud.py deleted file mode 100644 index c7335683..00000000 --- a/spektral/datasets/tud.py +++ /dev/null @@ -1,170 +0,0 @@ -import glob -import os -import shutil -import zipfile -from os import path as osp -from urllib.error import URLError - -import numpy as np -import pandas as pd -import requests -import scipy.sparse as sp -from sklearn.preprocessing import OneHotEncoder, StandardScaler - -from spektral.utils import io - -DATASET_URL = 'https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets' -DATASET_CLEAN_URL = 'https://raw.githubusercontent.com/nd7141/graph_datasets/master/datasets' -DATA_PATH = osp.expanduser('~/.spektral/datasets/') - - -def load_data(dataset_name, clean=False): - """ - Loads one of the Benchmark Data Sets for Graph Kernels from TU Dortmund - ([link](https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets)). - The node features are computed by concatenating the following features for - each node: - - - node attributes, if available, normalized as specified in `normalize_features`; - - clustering coefficient, normalized with z-score; - - node degrees, normalized as specified in `normalize_features`; - - node labels, if available, one-hot encoded. - :param dataset_name: name of the dataset to load (see `spektral.datasets.tud.AVAILABLE_DATASETS`). 
- :param clean: if True, return a version of the dataset with no isomorphic - graphs. - :return: - - a list of adjacency matrices; - - a list of node feature matrices; - - a numpy array containing the one-hot encoded targets. - """ - if clean: - dataset_name += '_clean' - if not osp.exists(DATA_PATH + dataset_name): - _download_data(dataset_name) - - # Read data - A_list, X_list, y = _read_graphs(dataset_name) - - print('Successfully loaded {}.'.format(dataset_name)) - - return A_list, X_list, y - - -def available_datasets(): - try: - return [ - d[:-4] - for d in pd.read_html(DATASET_URL)[0].Name[2:-1].values.tolist() - ] - except URLError: - # No internet, don't panic - print('No connection. See {}'.format(DATASET_URL)) - return [] - - -def _read_graphs(dataset_name): - file_prefix = osp.join(DATA_PATH, dataset_name, dataset_name) - available = [ - f.split(os.sep)[-1][len(dataset_name)+1:-4] - for f in glob.glob('{}_*.txt'.format(file_prefix)) - ] - - I = io.load_txt(file_prefix + '_graph_indicator.txt').astype(int) - 1 - unique_ids = np.unique(I) - num_graphs = len(unique_ids) - graph_sizes = np.bincount(I) - offsets = np.concatenate(([0], np.cumsum(graph_sizes)[:-1])) - edges = io.load_txt(file_prefix + '_A.txt', delimiter=',').astype(int) - 1 - - A_list = [[] for _ in range(num_graphs)] - for e in edges: - graph_id = I[e[0]] - A_list[graph_id].append(e - offsets[graph_id]) - A_list = map(np.array, A_list) - A_list = [ - sp.coo_matrix( - (np.ones_like(A[:, 0]), (A[:, 0], A[:, 1])), - shape=(graph_sizes[i], graph_sizes[i]) - ) - for i, A in enumerate(A_list) - ] - - X = [] - if 'node_attributes' in available: - X_na = io.load_txt(file_prefix + '_node_attributes.txt', delimiter=',') - if X_na.ndim == 1: - X_na = X_na[:, None] - X.append(X_na) - if 'node_labels' in available: - X_nl = io.load_txt(file_prefix + '_node_labels.txt') - X_nl = _normalize(X_nl.reshape(-1, 1), 'ohe') - X.append(X_nl) - if len(X) > 0: - X = np.concatenate(X, -1) - - X_list = [] - start = offsets[0] - for i in range(num_graphs): - stop = offsets[i + 1] if i + 1 < len(offsets) else None - X_list.append(X[start:stop]) - start = stop - - y = None - if 'graph_attributes' in available: - y = io.load_txt(file_prefix + '_graph_attributes.txt') - elif 'graph_labels' in available: - y = io.load_txt(file_prefix + '_graph_labels.txt') - y = _normalize(y[:, None], 'ohe') - - return A_list, X_list, y - - -def _download_data(dataset_name): - print('Dowloading ' + dataset_name + ' dataset.') - if dataset_name.endswith('_clean'): - true_name = dataset_name[:-6] - url = DATASET_CLEAN_URL - else: - true_name = dataset_name - url = DATASET_URL - - data_url = '{}/{}.zip'.format(url, true_name) - req = requests.get(data_url) - if req.status_code == 404: - raise ValueError('Unknown dataset {}. See spektral.datasets.tud.available_datasets()' - ' for a list of available datasets.' 
- .format(dataset_name)) - - os.makedirs(DATA_PATH, exist_ok=True) - with open(DATA_PATH + dataset_name + '.zip', 'wb') as out_file: - out_file.write(req.content) - with zipfile.ZipFile(DATA_PATH + dataset_name + '.zip', 'r') as zip_ref: - zip_ref.extractall(DATA_PATH + dataset_name + '/') - os.remove(DATA_PATH + dataset_name + '.zip') - - subfolder = osp.join(DATA_PATH, dataset_name, true_name) - parentfolder = osp.join(DATA_PATH, dataset_name) - for filename in os.listdir(subfolder): - try: - suffix = filename.split(true_name)[1] - except IndexError: - # Probably the README - continue - shutil.move( - osp.join(subfolder, filename), - osp.join(parentfolder, dataset_name + suffix) - ) - shutil.rmtree(subfolder) - - -def _normalize(x, norm=None): - """ - Apply one-hot encoding or z-score to a list of node features - """ - if norm == 'ohe': - fnorm = OneHotEncoder(sparse=False, categories='auto') - elif norm == 'zscore': - fnorm = StandardScaler() - else: - return x - return fnorm.fit_transform(x) From 9652161585296e25a504367045e84dfebda8affe Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Tue, 3 Nov 2020 10:46:30 +0100 Subject: [PATCH 08/57] Add support for Dataset transforms Add Dataset.apply and Dataset.map methods Add example for creating custom dataset Add Degree and MaxDegree transforms Rename localpooling_filter to gcn_filter Fix issue with train/test split in examples --- docs/autogen.py | 2 +- .../{BDGC_disjoint.py => custom_dataset.py} | 179 +++++++++++------- .../graph_prediction/ogbg-mol-esol_batch.py | 8 +- .../graph_prediction/ogbg-mol-hiv_disjoint.py | 8 +- examples/graph_prediction/qm9_batch.py | 3 +- examples/graph_prediction/qm9_disjoint.py | 4 +- examples/graph_prediction/tud_disjoint.py | 3 +- .../node_prediction/citation_simple_gc.py | 4 +- spektral/data/dataset.py | 26 ++- spektral/datasets/ogb.py | 4 - spektral/layers/convolutional/appnp.py | 2 +- spektral/layers/convolutional/graph_conv.py | 6 +- spektral/transforms/degree.py | 23 +++ spektral/utils/convolution.py | 2 +- spektral/utils/misc.py | 114 +++++------ 15 files changed, 229 insertions(+), 159 deletions(-) rename examples/graph_prediction/{BDGC_disjoint.py => custom_dataset.py} (50%) create mode 100644 spektral/transforms/degree.py diff --git a/docs/autogen.py b/docs/autogen.py index e3143e1c..a080fd5c 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -165,7 +165,7 @@ utils.convolution.laplacian, utils.convolution.normalized_laplacian, utils.convolution.rescale_laplacian, - utils.convolution.localpooling_filter, + utils.convolution.gcn_filter, utils.convolution.chebyshev_polynomial, utils.convolution.chebyshev_filter ], diff --git a/examples/graph_prediction/BDGC_disjoint.py b/examples/graph_prediction/custom_dataset.py similarity index 50% rename from examples/graph_prediction/BDGC_disjoint.py rename to examples/graph_prediction/custom_dataset.py index 01ebfd34..4773b44b 100644 --- a/examples/graph_prediction/BDGC_disjoint.py +++ b/examples/graph_prediction/custom_dataset.py @@ -1,15 +1,23 @@ """ -This example shows how to perform graph classification with a synthetic -benchmark dataset created by F. M. Bianchi (https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification), -using a GNN with convolutional and pooling blocks in disjoint mode. -This is a more advanced example that also shows how to do validation and early -stopping. For a beginner-level example, see qm9_disjoint.py. 
-""" +This example shows how to define your own dataset and use it to train a +non-trivial GNN with message-passing and pooling layers. +The script also shows how to implement fast training and evaluation functions +in disjoint mode, with early stopping and accuracy monitoring. + +The dataset that we create is a simple synthetic task in which we have random +graphs with randomly-colored nodes. The goal is to classify each graph with the +color that occurs the most on its nodes. For example, given a graph with 2 +colors and 3 nodes: -import os +x = [[1, 0], + [1, 0], + [0, 1]], + +the corresponding target will be [1, 0]. +""" +import networkx as nx import numpy as np -import requests import tensorflow as tf from tensorflow.keras.layers import Input, Dense from tensorflow.keras.losses import CategoricalCrossentropy @@ -17,63 +25,82 @@ from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam +from spektral.data import Dataset, Graph, DisjointLoader from spektral.layers import GraphConvSkip, GlobalAvgPool -from spektral.layers import ops from spektral.layers.pooling import TopKPool from spektral.utils.convolution import normalized_adjacency -from spektral.data.utils import to_disjoint, batch_generator - - -def evaluate(A_list, X_list, y_list, ops_list, batch_size): - batches = batch_generator([X_list, A_list, y_list], batch_size=batch_size) - output = [] - for b in batches: - X, A, I = to_disjoint(*b[:-1]) - A = ops.sp_matrix_to_sp_tensor(A) - y = b[-1] - pred = model([X, A, I], training=False) - outs = [o(y, pred) for o in ops_list] - output.append(outs) - return np.mean(output, 0) - ################################################################################ # PARAMETERS ################################################################################ learning_rate = 1e-3 # Learning rate epochs = 500 # Number of training epochs -es_patience = 50 # Patience for early stopping +es_patience = 10 # Patience for early stopping batch_size = 16 # Batch size -data_url = 'https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/raw/master/datasets/' -dataset_name = 'easy.npz' # Dataset ('easy.npz' or 'hard.npz') + ################################################################################ # LOAD DATA ################################################################################ -# Download graph classification data -if not os.path.exists(dataset_name): - print('Downloading ' + dataset_name + ' from ' + data_url) - req = requests.get(data_url + dataset_name) - with open(dataset_name, 'wb') as out_file: - out_file.write(req.content) - -# Load data -loaded = np.load(dataset_name, allow_pickle=True) -X_train, A_train, y_train = loaded['tr_feat'], list(loaded['tr_adj']), loaded['tr_class'] -X_test, A_test, y_test = loaded['te_feat'], list(loaded['te_adj']), loaded['te_class'] -X_val, A_val, y_val = loaded['val_feat'], list(loaded['val_adj']), loaded['val_class'] +class MyDataset(Dataset): + """ + A dataset of random colored graphs. + The task is to classify each graph with the color which occurs the most in + its nodes. + The graphs have `n_colors` colors, of at least `n_min` and at most `n_max` + nodes connected with probability `p`. 
+ """ + def __init__(self, n_graphs, n_colors=3, n_min=10, n_max=100, p=0.5, **kwargs): + self.n_graphs = n_graphs + self.n_colors = n_colors + self.n_min = n_min + self.n_max = n_max + self.p = p + super().__init__(**kwargs) + + def read(self): + def make_graph(): + n = np.random.randint(self.n_min, self.n_max) + colors = np.random.randint(0, self.n_colors, size=n) + + # Node features + x = np.zeros((n, self.n_colors)) + x[np.arange(n), colors] = 1 + + # Edges + a = nx.adj_matrix(nx.generators.gnp_random_graph(n, self.p)) + + # Labels + y = np.zeros((self.n_colors, )) + color_counts = x.sum(0) + y[np.argmax(color_counts)] = 1 + + return Graph(x=x, adj=a, y=y) + + # We must return a list of Graph objects + return [make_graph() for _ in range(self.n_graphs)] + + +dataset = MyDataset(1000) + +# Parameters +F = dataset.F # Dimension of node features +n_out = dataset.n_out # Dimension of the target # Preprocessing -A_train = [normalized_adjacency(a) for a in A_train] -A_val = [normalized_adjacency(a) for a in A_val] -A_test = [normalized_adjacency(a) for a in A_test] +for g in dataset.graphs: + g.adj = normalized_adjacency(g.adj) -# Parameters -F = X_train[0].shape[-1] # Dimension of node features -n_out = y_train[0].shape[-1] # Dimension of the target +# Train/valid/test split +idxs = np.random.permutation(len(dataset)) +split_va, split_te = int(0.8 * len(dataset)), int(0.9 * len(dataset)) +idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te]) +dataset_tr = dataset[idx_tr] +dataset_va = dataset[idx_va] +dataset_te = dataset[idx_te] ################################################################################ -# BUILD MODEL +# BUILD (unnecessarily big) MODEL ################################################################################ X_in = Input(shape=(F, ), name='X_in') A_in = Input(shape=(None,), sparse=True) @@ -94,26 +121,39 @@ def evaluate(A_list, X_list, y_list, ops_list, batch_size): acc_fn = CategoricalAccuracy() +################################################################################ +# FIT MODEL +################################################################################ @tf.function( - input_signature=(tf.TensorSpec((None, F), dtype=tf.float64), - tf.SparseTensorSpec((None, None), dtype=tf.float32), - tf.TensorSpec((None,), dtype=tf.int32), + input_signature=((tf.TensorSpec((None, F), dtype=tf.float64), + tf.SparseTensorSpec((None, None), dtype=tf.float64), + tf.TensorSpec((None,), dtype=tf.int64)), tf.TensorSpec((None, n_out), dtype=tf.float64)), experimental_relax_shapes=True) -def train_step(X_, A_, I_, y_): +def train_step(inputs, target): with tf.GradientTape() as tape: - predictions = model([X_, A_, I_], training=True) - loss = loss_fn(y_, predictions) + predictions = model(inputs, training=True) + loss = loss_fn(target, predictions) loss += sum(model.losses) - acc = acc_fn(y_, predictions) + acc = acc_fn(target, predictions) gradients = tape.gradient(loss, model.trainable_variables) opt.apply_gradients(zip(gradients, model.trainable_variables)) return loss, acc -################################################################################ -# FIT MODEL -################################################################################ +def evaluate(loader, ops_list): + output = [] + step = 0 + while step < loader.steps_per_epoch: + step += 1 + inputs, target = loader.__next__() + pred = model(inputs, training=False) + outs = [o(target, pred) for o in ops_list] + output.append(outs) + return np.mean(output, 0) + + +print('Fitting model') 
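# Editor's note on the loop below: the DisjointLoader is iterated directly and
# each batch is an (inputs, target) tuple, where inputs = (x, a, i) are the
# stacked node features, the block-diagonal sparse adjacency matrix, and the
# index vector mapping every node to its graph. The loader keeps yielding
# batches for `epochs` epochs, so epochs are tracked manually through
# loader_tr.steps_per_epoch.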
current_batch = 0 epoch = 0 model_loss = 0 @@ -121,28 +161,22 @@ def train_step(X_, A_, I_, y_): best_val_loss = np.inf best_weights = None patience = es_patience -batches_in_epoch = np.ceil(y_train.shape[0] / batch_size) -print('Fitting model') -batches = batch_generator([X_train, A_train, y_train], - batch_size=batch_size, epochs=epochs) -for b in batches: - current_batch += 1 - - X_, A_, I_ = to_disjoint(*b[:-1]) - A_ = ops.sp_matrix_to_sp_tensor(A_) - y_ = b[-1] - outs = train_step(X_, A_, I_, y_) +loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) +loader_va = DisjointLoader(dataset_va, batch_size=batch_size) +for batch in loader_tr: + outs = train_step(*batch) model_loss += outs[0] model_acc += outs[1] - if current_batch == batches_in_epoch: + current_batch += 1 + if current_batch == loader_tr.steps_per_epoch: + model_loss /= loader_tr.steps_per_epoch + model_acc /= loader_tr.steps_per_epoch epoch += 1 - model_loss /= batches_in_epoch - model_acc /= batches_in_epoch # Compute validation loss and accuracy - val_loss, val_acc = evaluate(A_val, X_val, y_val, [loss_fn, acc_fn], batch_size=batch_size) + val_loss, val_acc = evaluate(loader_va, [loss_fn, acc_fn]) print('Ep. {} - Loss: {:.2f} - Acc: {:.2f} - Val loss: {:.2f} - Val acc: {:.2f}' .format(epoch, model_loss, model_acc, val_loss, val_acc)) @@ -166,5 +200,6 @@ def train_step(X_, A_, I_, y_): ################################################################################ print('Testing model') model.set_weights(best_weights) # Load best model -test_loss, test_acc = evaluate(A_test, X_test, y_test, [loss_fn, acc_fn], batch_size=batch_size) +loader_te = DisjointLoader(dataset_te, batch_size=batch_size) +test_loss, test_acc = evaluate(loader_te, [loss_fn, acc_fn]) print('Done. Test loss: {:.4f}. 
Test acc: {:.2f}'.format(test_loss, test_acc)) diff --git a/examples/graph_prediction/ogbg-mol-esol_batch.py b/examples/graph_prediction/ogbg-mol-esol_batch.py index 489be31f..41aad017 100644 --- a/examples/graph_prediction/ogbg-mol-esol_batch.py +++ b/examples/graph_prediction/ogbg-mol-esol_batch.py @@ -38,10 +38,10 @@ # Train/test split idx = ogb_dataset.get_idx_split() -tr_idx, va_idx, te_idx = idx["train"], idx["valid"], idx["test"] -dataset_tr = dataset[tr_idx] -dataset_va = dataset[va_idx] -dataset_te = dataset[te_idx] +idx_tr, idx_va, idx_te = idx["train"], idx["valid"], idx["test"] +dataset_tr = dataset[idx_tr] +dataset_va = dataset[idx_va] +dataset_te = dataset[idx_te] ################################################################################ # BUILD MODEL diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py index 44cc1618..8e336a9c 100644 --- a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py +++ b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py @@ -39,10 +39,10 @@ # Train/test split idx = ogb_dataset.get_idx_split() -tr_idx, va_idx, te_idx = idx["train"], idx["valid"], idx["test"] -dataset_tr = dataset[tr_idx] -dataset_va = dataset[va_idx] -dataset_te = dataset[te_idx] +idx_tr, idx_va, idx_te = idx["train"], idx["valid"], idx["test"] +dataset_tr = dataset[idx_tr] +dataset_va = dataset[idx_va] +dataset_te = dataset[idx_te] ################################################################################ # BUILD MODEL diff --git a/examples/graph_prediction/qm9_batch.py b/examples/graph_prediction/qm9_batch.py index 5c1e1564..437281c2 100644 --- a/examples/graph_prediction/qm9_batch.py +++ b/examples/graph_prediction/qm9_batch.py @@ -32,7 +32,8 @@ # Train/test split idxs = np.random.permutation(len(dataset)) split = int(0.9 * len(dataset)) -dataset_tr, dataset_te = dataset[:split], dataset[split:] +idx_tr, idx_te = np.split(idxs, [split]) +dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te] ################################################################################ # BUILD MODEL diff --git a/examples/graph_prediction/qm9_disjoint.py b/examples/graph_prediction/qm9_disjoint.py index 0c7fa4f8..88b958a0 100644 --- a/examples/graph_prediction/qm9_disjoint.py +++ b/examples/graph_prediction/qm9_disjoint.py @@ -34,7 +34,9 @@ # Train/test split idxs = np.random.permutation(len(dataset)) split = int(0.9 * len(dataset)) -dataset_tr, dataset_te = dataset[:split], dataset[split:] +idx_tr, idx_te = np.split(idxs, [split]) +dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te] + ################################################################################ # BUILD MODEL diff --git a/examples/graph_prediction/tud_disjoint.py b/examples/graph_prediction/tud_disjoint.py index 5e6f1357..f3687209 100644 --- a/examples/graph_prediction/tud_disjoint.py +++ b/examples/graph_prediction/tud_disjoint.py @@ -37,7 +37,8 @@ # Train/test split idxs = np.random.permutation(len(dataset)) split = int(0.9 * len(dataset)) -dataset_tr, dataset_te = dataset[:split], dataset[split:] +idx_tr, idx_te = np.split(idxs, [split]) +dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te] ################################################################################ diff --git a/examples/node_prediction/citation_simple_gc.py b/examples/node_prediction/citation_simple_gc.py index ac8b12e1..0058e251 100644 --- a/examples/node_prediction/citation_simple_gc.py +++ b/examples/node_prediction/citation_simple_gc.py @@ -13,7 +13,7 @@ from 
spektral.datasets import citation from spektral.layers import GraphConv -from spektral.utils.convolution import localpooling_filter +from spektral.utils.convolution import gcn_filter # Load data dataset = 'cora' @@ -30,7 +30,7 @@ es_patience = 200 # Patience for early stopping # Preprocessing operations -fltr = localpooling_filter(A).astype('f4') +fltr = gcn_filter(A).astype('f4') X = X.toarray() # Pre-compute propagation diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index ee6ce913..67301f6e 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -44,13 +44,14 @@ def read(self): graphs is stored in `dataset.signature`. This can be useful when implementing a custom Loader for your dataset. """ - def __init__(self, **kwargs): + def __init__(self, transforms=None, **kwargs): if not osp.exists(self.path): self.download() self.graphs = self.read() # Make sure that we always have at least one graph if len(self.graphs) == 0: raise ValueError('Datasets cannot be empty') + self.F = None self.S = None self.n_out = None @@ -60,6 +61,16 @@ def __init__(self, **kwargs): for k, v in kwargs.items(): setattr(self, k, v) + # Apply transforms + if transforms is not None: + if not isinstance(transforms, (list, tuple)) and callable(transforms): + transforms = [transforms] + else: + raise ValueError('transforms must be a list of callables or ' + 'a callable.') + for t in transforms: + self.apply(t) + @property def path(self): return osp.join(DATASET_FOLDER, self.__class__.__name__) @@ -68,7 +79,18 @@ def read(self): raise NotImplementedError def download(self): - raise NotImplementedError + pass + + def apply(self, transform): + for i in range(len(self.graphs)): + self.graphs[i] = transform(self.graphs[i]) + + def map(self, transform, reduce=None): + out = [transform(g) for g in self.graphs] + if reduce is not None and callable(reduce): + return reduce(out) + else: + return out def _signature(self): signature = {} diff --git a/spektral/datasets/ogb.py b/spektral/datasets/ogb.py index fa6749ae..73669885 100644 --- a/spektral/datasets/ogb.py +++ b/spektral/datasets/ogb.py @@ -20,10 +20,6 @@ def __init__(self, dataset, **kwargs): def read(self): return [Graph(*_elem_to_numpy(elem)) for elem in self.dataset] - def download(self): - # Download is handled by OGB - pass - def _elem_to_numpy(elem): graph, label = elem diff --git a/spektral/layers/convolutional/appnp.py b/spektral/layers/convolutional/appnp.py index c4e8d6a9..a5e16208 100644 --- a/spektral/layers/convolutional/appnp.py +++ b/spektral/layers/convolutional/appnp.py @@ -26,7 +26,7 @@ class APPNP(GraphConv): - Node features of shape `([batch], N, F)`; - Modified Laplacian of shape `([batch], N, N)`; can be computed with - `spektral.utils.convolution.localpooling_filter`. + `spektral.utils.convolution.gcn_filter`. **Output** diff --git a/spektral/layers/convolutional/graph_conv.py b/spektral/layers/convolutional/graph_conv.py index 980e80bc..0a77eb66 100644 --- a/spektral/layers/convolutional/graph_conv.py +++ b/spektral/layers/convolutional/graph_conv.py @@ -3,7 +3,7 @@ from tensorflow.keras.layers import Layer from spektral.layers import ops -from spektral.utils import localpooling_filter +from spektral.utils import gcn_filter class GraphConv(Layer): @@ -24,7 +24,7 @@ class GraphConv(Layer): - Node features of shape `([batch], N, F)`; - Modified Laplacian of shape `([batch], N, N)`; can be computed with - `spektral.utils.convolution.localpooling_filter`. + `spektral.utils.convolution.gcn_filter`. 
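# Editor's note: a minimal sketch (not part of this patch) of preprocessing an
# adjacency matrix with the renamed gcn_filter, as mentioned in the docstring
# line above; gcn_filter adds self-loops and symmetrically normalizes, i.e. it
# computes D^-1/2 (A + I) D^-1/2.
import numpy as np
from spektral.utils.convolution import gcn_filter

A = np.array([[0., 1., 0.],
              [1., 0., 1.],
              [0., 1., 0.]])
A_hat = gcn_filter(A)  # same shape as A, ready to be fed to GraphConv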
**Output** @@ -124,4 +124,4 @@ def get_config(self): @staticmethod def preprocess(A): - return localpooling_filter(A) \ No newline at end of file + return gcn_filter(A) \ No newline at end of file diff --git a/spektral/transforms/degree.py b/spektral/transforms/degree.py new file mode 100644 index 00000000..0b44a793 --- /dev/null +++ b/spektral/transforms/degree.py @@ -0,0 +1,23 @@ +import numpy as np + +from spektral.utils import one_hot + + +class Degree(object): + def __init__(self, max_degree): + self.max_degree = max_degree + + def __call__(self, graph): + degree = graph.adj.sum(1) + degree = one_hot(degree, self.max_degree + 1) + if graph.x is None: + graph.x = degree + else: + graph.x = np.concatenate((graph.x, degree), axis=-1) + + return graph + + +class MaxDegree(object): + def __call__(self, graph): + return graph.adj.sum(1).max() diff --git a/spektral/utils/convolution.py b/spektral/utils/convolution.py index 3fdc3014..c4ecd492 100644 --- a/spektral/utils/convolution.py +++ b/spektral/utils/convolution.py @@ -101,7 +101,7 @@ def rescale_laplacian(L, lmax=None): return L_scaled -def localpooling_filter(A, symmetric=True): +def gcn_filter(A, symmetric=True): r""" Computes the graph filter described in [Kipf & Welling (2017)](https://arxiv.org/abs/1609.02907). diff --git a/spektral/utils/misc.py b/spektral/utils/misc.py index 809c266e..94ee7864 100644 --- a/spektral/utils/misc.py +++ b/spektral/utils/misc.py @@ -109,73 +109,63 @@ def sub_eye_jagged(x): return x_out -def int_to_one_hot(x, n=None): - """ - Encodes x in a 1-of-n array. - :param x: an integer or array of integers, such that x < n - :param n: an integer - :return: an array of shape (x.shape[0], n) if x is an array, (n, ) if - x is an integer - """ - if isinstance(x, int): - if n is None: - raise ValueError('n is required to one-hot encode a single integer') - if x >= n: - raise ValueError('x must be smaller than n in order to one-hot encode') - output = np.zeros((n,)) - output[x] = 1 - else: - if n is None: - n = int(np.max(x) + 1) - else: - if np.max(x) >= n: - raise ValueError('The maximum value in x ({}) is greater than ' - 'n ({}), therefore 1-of-n encoding is not ' - 'possible'.format(np.max(x), n)) - x = np.array(x, dtype=np.int) - if x.ndim == 1: - x = x[:, None] - orig_shp = x.shape - x = np.reshape(x, (-1, orig_shp[-1])) - output = np.zeros((x.shape[0], n)) - output[np.arange(x.shape[0]), x.squeeze()] = 1 - output = output.reshape(orig_shp[:-1] + (n,)) +def one_hot(x, depth): + """ + One-hot encodes the integer array `x` in an array of length `depth`. + :param x: a np.array of integers. + :param depth: size of the one-hot vectors. + :return: an array of shape `x.shape + (depth, )` + """ + x = np.array(x).astype(int) + out = np.eye(depth)[x] - return output + return out -def label_to_one_hot(x, labels=None): +def label_to_one_hot(x, labels): """ - Encodes x in a 1-of-n array. - :param x: any object or array of objects s.t. x is contained in `labels`. - The function may behave unexpectedly if x is a single object but - `hasattr(x, '__len__')`, and works best with integers or discrete entities. - :param labels: a list of n labels to compute the one-hot vector - :return: an array of shape (x.shape[0], n) if x is an array, (n, ) if - x is a single object + One-hot encodes the integer array `x` according to the given `labels`. + + :param x: a np.array of integers. Each value must be contained in `labels`. + :param labels: list/tuple/np.array of labels. 
+ :return: an array of shape `x.shape + (len(labels), )` """ - n = len(labels) - labels_idx = {l: i for i, l in enumerate(labels)} - if not hasattr(x, '__len__'): - output = np.zeros((n,)) - output[labels_idx[x]] = 1 - else: - x = np.array(x, dtype=np.int) - orig_shp = x.shape - x = np.reshape(x, (-1)) - output = np.zeros((x.shape[0], n)) - for i in range(len(x)): - try: - output[i, labels_idx[x[i]]] = 1 - except KeyError: - pass - if len(orig_shp) == 1: - output_shape = orig_shp + (n,) - else: - output_shape = orig_shp[:-1] + (n,) - output = output.reshape(output_shape) + if not isinstance(labels, (list, tuple, np.ndarray)): + raise ValueError('labels must be list, tuple, or np.ndarray') + if not np.all(np.in1d(x, labels)): + raise ValueError('All values in x must be contained in labels') + depth = len(labels) + x = np.array(x).astype(int) + out = x.copy() + for i, label in enumerate(labels): + out[x == label] = i - return output + return one_hot(out, depth) + + +def add_self_loops(a, value=1): + """ + Sets the inner diagonals of `a` to `value`. + :param a: a np.array or scipy.sparse matrix, the innermost two dimensions + must be equal. + :param value: value to set the diagonals to. + :return: a np.array or scipy.sparse matrix with the same shape as `a`. + """ + a = a.copy() + if len(a.shape) < 2: + raise ValueError('a must have at least rank 2') + n = a.shape[-1] + if n != a.shape[-2]: + raise ValueError('Innermost two dimensions must be equal. Got {}' + .format(a.shape)) + if sp.issparse(a): + a = a.tolil() + a.setdiag(value) + return a.tocsr() + else: + idx = np.arange(n) + a[..., idx, idx] = value + return a def flatten_list_gen(alist): @@ -186,7 +176,7 @@ def flatten_list_gen(alist): arbitrarily nested. """ for item in alist: - if isinstance(item, list) or isinstance(item, np.ndarray): + if isinstance(item, (list, tuple, np.ndarray)): for i in flatten_list_gen(item): yield i else: From 2a9002fa7903841e7acce96e03289eafc816c330 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Tue, 3 Nov 2020 11:02:30 +0100 Subject: [PATCH 09/57] Remove Delaunay dataset --- docs/autogen.py | 8 -- examples/graph_prediction/delaunay_batch.py | 66 ----------- spektral/datasets/__init__.py | 1 - spektral/datasets/delaunay.py | 122 -------------------- tests/{data => test_data}/test_dataset.py | 0 tests/{data => test_data}/test_graph.py | 0 tests/{data => test_data}/test_loaders.py | 0 tests/{data => test_data}/test_utils.py | 0 tests/test_datasets.py | 11 +- 9 files changed, 1 insertion(+), 207 deletions(-) delete mode 100644 examples/graph_prediction/delaunay_batch.py delete mode 100644 spektral/datasets/delaunay.py rename tests/{data => test_data}/test_dataset.py (100%) rename tests/{data => test_data}/test_graph.py (100%) rename tests/{data => test_data}/test_loaders.py (100%) rename tests/{data => test_data}/test_utils.py (100%) diff --git a/docs/autogen.py b/docs/autogen.py index a080fd5c..f28069af 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -131,14 +131,6 @@ 'methods': [], 'classes': [] }, - { - 'page': 'datasets.md', - 'functions': [ - datasets.delaunay.generate_data - ], - 'methods': [], - 'classes': [] - }, # Utils #################################################################### { 'page': 'utils/data.md', diff --git a/examples/graph_prediction/delaunay_batch.py b/examples/graph_prediction/delaunay_batch.py deleted file mode 100644 index 2a02bbbd..00000000 --- a/examples/graph_prediction/delaunay_batch.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -This example shows how to perform 
graph classification with a synthetic dataset -of Delaunay triangulations, using a graph attention network (Velickovic et al.) -in batch mode. -""" - -from sklearn.model_selection import train_test_split -from tensorflow.keras.callbacks import EarlyStopping -from tensorflow.keras.layers import Input, Dense -from tensorflow.keras.models import Model -from tensorflow.keras.optimizers import Adam -from tensorflow.keras.regularizers import l2 - -from spektral.datasets import delaunay -from spektral.layers import GraphAttention, GlobalAttentionPool - -# Load data -A, X, y = delaunay.generate_data(return_type='numpy', classes=[0, 5]) - -# Parameters -N = X.shape[-2] # Number of nodes in the graphs -F = X.shape[-1] # Original feature dimensionality -n_classes = y.shape[-1] # Number of classes -l2_reg = 5e-4 # Regularization rate for l2 -learning_rate = 1e-3 # Learning rate for Adam -epochs = 20000 # Number of training epochs -batch_size = 32 # Batch size -es_patience = 200 # Patience fot early stopping - -# Train/test split -A_train, A_test, \ -x_train, x_test, \ -y_train, y_test = train_test_split(A, X, y, test_size=0.1) - -# Model definition -X_in = Input(shape=(N, F)) -A_in = Input((N, N)) - -gc1 = GraphAttention(32, activation='relu', kernel_regularizer=l2(l2_reg))([X_in, A_in]) -gc2 = GraphAttention(32, activation='relu', kernel_regularizer=l2(l2_reg))([gc1, A_in]) -pool = GlobalAttentionPool(128)(gc2) - -output = Dense(n_classes, activation='softmax')(pool) - -# Build model -model = Model(inputs=[X_in, A_in], outputs=output) -optimizer = Adam(lr=learning_rate) -model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc']) -model.summary() - -# Train model -model.fit([x_train, A_train], - y_train, - batch_size=batch_size, - validation_split=0.1, - epochs=epochs, - callbacks=[ - EarlyStopping(patience=es_patience, restore_best_weights=True) - ]) - -# Evaluate model -print('Evaluating model.') -eval_results = model.evaluate([x_test, A_test], - y_test, - batch_size=batch_size) -print('Done. Test loss: {:.4f}. Test acc: {:.2f}'.format(*eval_results)) diff --git a/spektral/datasets/__init__.py b/spektral/datasets/__init__.py index be4c234b..c44da679 100644 --- a/spektral/datasets/__init__.py +++ b/spektral/datasets/__init__.py @@ -1,5 +1,4 @@ from . import citation -from . import delaunay from . import graphsage from . import mnist from . import ogb diff --git a/spektral/datasets/delaunay.py b/spektral/datasets/delaunay.py deleted file mode 100644 index 2f54b8b1..00000000 --- a/spektral/datasets/delaunay.py +++ /dev/null @@ -1,122 +0,0 @@ -import numpy as np -from scipy.spatial import Delaunay - -from spektral.utils import label_to_one_hot, numpy_to_nx - -RETURN_TYPES = {'numpy', 'networkx'} - - -def generate_data(classes=0, n_samples_in_class=1000, n_nodes=7, support_low=0., - support_high=10., drift_amount=1.0, one_hot_labels=True, - support=None, seed=None, return_type='numpy'): - """ - Generates a dataset of Delaunay triangulations as described by - [Zambon et al. (2017)](https://arxiv.org/abs/1706.06941). - - Node attributes are the 2D coordinates of the points. - Two nodes are connected if they share an edge in the Delaunay triangulation. - Labels represent the class of the graph (0 to 20, each class index i - represent the "difficulty" of the classification problem 0 v. i. In other - words, the higher the class index, the more similar the class is to class 0). 
- - :param classes: indices of the classes to load (integer, or list of integers - between 0 and 20); - :param n_samples_in_class: number of generated samples per class; - :param n_nodes: number of nodes in a graph; - :param support_low: lower bound of the uniform distribution from which the - support is generated; - :param support_high: upper bound of the uniform distribution from which the - support is generated; - :param drift_amount: coefficient to control the amount of change between - classes; - :param one_hot_labels: one-hot encode dataset labels; - :param support: custom support to use instead of generating it randomly; - :param seed: random numpy seed; - :param return_type: `'numpy'` or `'networkx'`, data format to return; - :return: - - if `return_type='numpy'`, the adjacency matrix, node features, and - an array containing labels; - - if `return_type='networkx'`, a list of graphs in Networkx format, and an - array containing labels; - """ - if return_type not in RETURN_TYPES: - raise ValueError('Possible return_type: {}'.format(RETURN_TYPES)) - - if isinstance(classes, int): - classes = [classes] - - if max(classes) > 20 or min(classes) < 0: - raise ValueError('Class indices must be between 0 and 20') - - r_classes = list(reversed(classes)) - if r_classes[-1] == 0: - r_classes.insert(0, r_classes.pop(-1)) - - # Support points - np.random.seed(seed) - if support is None: - support = np.random.uniform(support_low, support_high, (1, n_nodes, 2)) - else: - try: - assert support.shape == (1, n_nodes, 2) - except AssertionError: - print('The given support doesn\'t have shape (1, n_nodes, 2) as' - 'expected. Attempting to reshape.') - support = support.reshape(1, n_nodes, 2) - - # Compute node features - node_features = [] - # Other node features - for idx, i in enumerate(r_classes): - if i == 0: - concept_0 = np.repeat(support, n_samples_in_class, 0) - noise_0 = np.random.normal(0, 1, (n_samples_in_class, n_nodes, 2)) - class_0 = concept_0 + noise_0 - node_features.append(class_0) - else: - radius = 10. * ((2./3.) ** (drift_amount * (i - 1))) - phase = np.random.uniform(0, 2 * np.pi, (n_nodes, 1)) - perturb_i_x = radius * np.cos(phase) - perturb_i_y = radius * np.sin(phase) - perturb_i = np.concatenate((perturb_i_x, perturb_i_y), axis=-1) - support_i = support + perturb_i - concept_i = np.repeat(support_i, n_samples_in_class, 0) - noise_i = np.random.normal(0, 1, (n_samples_in_class, n_nodes, 2)) - class_i = concept_i + noise_i - node_features.append(class_i) - node_features = np.array(node_features).reshape((-1, n_nodes, 2)) - - # Compute adjacency matrices - adjacency = [] - for nf in node_features: - adj = _compute_adj(nf) - adjacency.append(adj) - adjacency = np.array(adjacency) - - # Compute labels - labels = np.repeat(classes, n_samples_in_class) - if one_hot_labels: - labels = label_to_one_hot(labels, labels=classes) - - if return_type == 'numpy': - return adjacency, node_features, labels - elif return_type == 'networkx': - graphs = numpy_to_nx(adjacency, node_features=node_features, nf_name='coords') - return graphs, labels - else: - raise NotImplementedError - - -def _compute_adj(x): - """ - Computes the Delaunay triangulation of the given points - :param x: array of shape (num_nodes, 2) - :return: the computed adjacency matrix - """ - tri = Delaunay(x) - edges_explicit = np.concatenate((tri.vertices[:, :2], - tri.vertices[:, 1:], - tri.vertices[:, ::2]), axis=0) - adj = np.zeros((x.shape[0], x.shape[0])) - adj[edges_explicit[:, 0], edges_explicit[:, 1]] = 1. 
- return np.clip(adj + adj.T, 0, 1) diff --git a/tests/data/test_dataset.py b/tests/test_data/test_dataset.py similarity index 100% rename from tests/data/test_dataset.py rename to tests/test_data/test_dataset.py diff --git a/tests/data/test_graph.py b/tests/test_data/test_graph.py similarity index 100% rename from tests/data/test_graph.py rename to tests/test_data/test_graph.py diff --git a/tests/data/test_loaders.py b/tests/test_data/test_loaders.py similarity index 100% rename from tests/data/test_loaders.py rename to tests/test_data/test_loaders.py diff --git a/tests/data/test_utils.py b/tests/test_data/test_utils.py similarity index 100% rename from tests/data/test_utils.py rename to tests/test_data/test_utils.py diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 10b98910..c5bc72a0 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,5 +1,5 @@ -from spektral.datasets import delaunay, qm9, citation, graphsage, mnist, tudataset from spektral.data import DisjointLoader, BatchLoader +from spektral.datasets import qm9, citation, graphsage, mnist, tudataset batch_size = 3 @@ -28,15 +28,6 @@ def test_graphsage(): graphsage.load_data(dataset_name) -def test_delaunay(): - adj, nf, labels = delaunay.generate_data(return_type='numpy', classes=[0, 1, 2]) - correctly_padded(adj, nf, None) - assert adj.shape[0] == labels.shape[0] - - # Test that it doesn't crash - delaunay.generate_data(return_type='networkx') - - def test_mnist(): mnist.load_data(k=8, noise_level=0.1) From 4f0e77a669e5ada82d8b7c7f1f5241b26ac2390a Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Tue, 3 Nov 2020 11:14:46 +0100 Subject: [PATCH 10/57] Add new transforms --- examples/graph_prediction/custom_dataset.py | 8 ++------ spektral/transforms/adj_to_sp_tensor.py | 9 +++++++++ spektral/transforms/layer_preprocess.py | 18 ++++++++++++++++++ 3 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 spektral/transforms/adj_to_sp_tensor.py create mode 100644 spektral/transforms/layer_preprocess.py diff --git a/examples/graph_prediction/custom_dataset.py b/examples/graph_prediction/custom_dataset.py index 4773b44b..c1188a38 100644 --- a/examples/graph_prediction/custom_dataset.py +++ b/examples/graph_prediction/custom_dataset.py @@ -28,7 +28,7 @@ from spektral.data import Dataset, Graph, DisjointLoader from spektral.layers import GraphConvSkip, GlobalAvgPool from spektral.layers.pooling import TopKPool -from spektral.utils.convolution import normalized_adjacency +from spektral.transforms.normalize_adj import NormalizeAdj ################################################################################ # PARAMETERS @@ -81,16 +81,12 @@ def make_graph(): return [make_graph() for _ in range(self.n_graphs)] -dataset = MyDataset(1000) +dataset = MyDataset(1000, transforms=NormalizeAdj()) # Parameters F = dataset.F # Dimension of node features n_out = dataset.n_out # Dimension of the target -# Preprocessing -for g in dataset.graphs: - g.adj = normalized_adjacency(g.adj) - # Train/valid/test split idxs = np.random.permutation(len(dataset)) split_va, split_te = int(0.8 * len(dataset)), int(0.9 * len(dataset)) diff --git a/spektral/transforms/adj_to_sp_tensor.py b/spektral/transforms/adj_to_sp_tensor.py new file mode 100644 index 00000000..212eef9f --- /dev/null +++ b/spektral/transforms/adj_to_sp_tensor.py @@ -0,0 +1,9 @@ +from spektral.layers.ops import sp_matrix_to_sp_tensor + + +class AdjToSpTensor(object): + def __call__(self, graph): + if graph.adj is not None: + graph.adj = 
sp_matrix_to_sp_tensor(graph.adj) + + return graph diff --git a/spektral/transforms/layer_preprocess.py b/spektral/transforms/layer_preprocess.py new file mode 100644 index 00000000..482577c6 --- /dev/null +++ b/spektral/transforms/layer_preprocess.py @@ -0,0 +1,18 @@ +class LayerPreprocess(object): + """ + Applies the `preprocess` function of a convolutional Layer to the adjacency + matrix. + + **Arguments** + + - `layer_class`: the class of a layer from `spektral.layers.convolutional`, + or any Layer that implements a `preprocess(adj)` method. + """ + def __init__(self, layer_class): + self.layer_class = layer_class + + def __call__(self, graph): + if graph.adj is not None and hasattr(self.layer_class, 'preprocess'): + graph.adj = self.layer_class.preprocess(graph.adj) + + return graph From 0be3308d4b886d051a9a1e14572eeb05652d422e Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Tue, 3 Nov 2020 18:10:36 +0100 Subject: [PATCH 11/57] Add SingleLoader Implement citation datasets as dataset.py Add GCNFilter and NormalizeAdj transforms Update examples --- .../graph_prediction/ogbg-mol-esol_batch.py | 2 +- .../graph_prediction/ogbg-mol-hiv_disjoint.py | 2 +- examples/graph_prediction/qm9_batch.py | 2 +- examples/graph_prediction/qm9_disjoint.py | 2 +- examples/graph_prediction/tud_disjoint.py | 4 +- examples/node_prediction/citation_arma.py | 64 +++-- examples/node_prediction/citation_cheby.py | 58 ++--- examples/node_prediction/citation_gat.py | 57 ++--- examples/node_prediction/citation_gcn.py | 56 ++--- .../node_prediction/citation_simple_gc.py | 78 +++--- spektral/data/dataset.py | 8 +- spektral/data/loaders.py | 27 +- spektral/datasets/__init__.py | 8 +- spektral/datasets/citation.py | 232 ++++++++---------- spektral/transforms/__init__.py | 5 + spektral/transforms/gcn_filter.py | 12 + spektral/transforms/normalize_adj.py | 12 + tests/test_datasets.py | 18 +- 18 files changed, 330 insertions(+), 317 deletions(-) create mode 100644 spektral/transforms/__init__.py create mode 100644 spektral/transforms/gcn_filter.py create mode 100644 spektral/transforms/normalize_adj.py diff --git a/examples/graph_prediction/ogbg-mol-esol_batch.py b/examples/graph_prediction/ogbg-mol-esol_batch.py index 41aad017..21828032 100644 --- a/examples/graph_prediction/ogbg-mol-esol_batch.py +++ b/examples/graph_prediction/ogbg-mol-esol_batch.py @@ -13,7 +13,7 @@ from tensorflow.keras.optimizers import Adam from spektral.data import BatchLoader -from spektral.datasets.ogb import OGB +from spektral.datasets import OGB from spektral.layers import GraphConv, MinCutPool, GlobalSumPool ################################################################################ diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py index 8e336a9c..29649ffd 100644 --- a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py +++ b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py @@ -15,7 +15,7 @@ from tensorflow.keras.optimizers import Adam from spektral.data import DisjointLoader -from spektral.datasets.ogb import OGB +from spektral.datasets import OGB from spektral.layers import EdgeConditionedConv, GlobalSumPool ################################################################################ diff --git a/examples/graph_prediction/qm9_batch.py b/examples/graph_prediction/qm9_batch.py index 437281c2..96e7f523 100644 --- a/examples/graph_prediction/qm9_batch.py +++ b/examples/graph_prediction/qm9_batch.py @@ -9,7 +9,7 @@ from tensorflow.keras.optimizers import Adam 
from spektral.data import BatchLoader -from spektral.datasets.qm9 import QM9 +from spektral.datasets import QM9 from spektral.layers import EdgeConditionedConv, GlobalSumPool ################################################################################ diff --git a/examples/graph_prediction/qm9_disjoint.py b/examples/graph_prediction/qm9_disjoint.py index 88b958a0..b02e65c4 100644 --- a/examples/graph_prediction/qm9_disjoint.py +++ b/examples/graph_prediction/qm9_disjoint.py @@ -11,7 +11,7 @@ from tensorflow.keras.optimizers import Adam from spektral.data import DisjointLoader -from spektral.datasets.qm9 import QM9 +from spektral.datasets import QM9 from spektral.layers import EdgeConditionedConv, GlobalSumPool ################################################################################ diff --git a/examples/graph_prediction/tud_disjoint.py b/examples/graph_prediction/tud_disjoint.py index f3687209..a1518a35 100644 --- a/examples/graph_prediction/tud_disjoint.py +++ b/examples/graph_prediction/tud_disjoint.py @@ -13,7 +13,7 @@ from tensorflow.keras.optimizers import Adam from spektral.data import DisjointLoader -from spektral.datasets import tudataset +from spektral.datasets import TUDataset from spektral.layers import GINConv, GlobalAvgPool ################################################################################ @@ -28,7 +28,7 @@ ################################################################################ # LOAD DATA ################################################################################ -dataset = tudataset.TUDataset('PROTEINS', clean=True) +dataset = TUDataset('PROTEINS', clean=True) # Parameters F = dataset.F # Dimension of node features diff --git a/examples/node_prediction/citation_arma.py b/examples/node_prediction/citation_arma.py index f062cd3a..07561e66 100644 --- a/examples/node_prediction/citation_arma.py +++ b/examples/node_prediction/citation_arma.py @@ -11,31 +11,30 @@ from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets import citation +from spektral.data.loaders import SingleLoader +from spektral.datasets.citation import Citation from spektral.layers import ARMAConv +from spektral.transforms import LayerPreprocess, AdjToSpTensor # Load data -dataset = 'cora' -A, X, y, train_mask, val_mask, test_mask = citation.load_data(dataset) +dataset = Citation('cora', + transforms=[LayerPreprocess(ARMAConv), AdjToSpTensor()]) +mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Parameters -channels = 16 # Number of channels in the first layer -iterations = 1 # Number of iterations to approximate each ARMA(1) -order = 2 # Order of the ARMA filter (number of parallel stacks) -share_weights = True # Share weights in each ARMA stack -N = X.shape[0] # Number of nodes in the graph -F = X.shape[1] # Original feature dimensionality -n_classes = y.shape[1] # Number of classes -dropout = 0.5 # Dropout rate applied between layers -dropout_skip = 0.75 # Dropout rate for the internal skip connection of ARMA -l2_reg = 5e-5 # L2 regularization rate -learning_rate = 1e-2 # Learning rate -epochs = 20000 # Number of training epochs -es_patience = 100 # Patience for early stopping - -# Preprocessing operations -fltr = ARMAConv.preprocess(A).astype('f4') -X = X.toarray() +channels = 16 # Number of channels in the first layer +iterations = 1 # Number of iterations to approximate each ARMA(1) +order = 2 # Order of the ARMA filter (number of parallel stacks) +share_weights = True # Share 
weights in each ARMA stack +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes +dropout = 0.5 # Dropout rate for the features +dropout_skip = 0.75 # Dropout rate for the internal skip connection of ARMA +l2_reg = 5e-5 # L2 regularization rate +learning_rate = 1e-2 # Learning rate +epochs = 20000 # Number of training epochs +patience = 100 # Patience for early stopping # Model definition X_in = Input(shape=(F, )) @@ -50,7 +49,7 @@ gcn_activation='elu', kernel_regularizer=l2(l2_reg))([X_in, fltr_in]) gc_2 = Dropout(dropout)(gc_1) -gc_2 = ARMAConv(n_classes, +gc_2 = ARMAConv(n_out, iterations=1, order=1, share_weights=share_weights, @@ -68,24 +67,19 @@ model.summary() # Train model -validation_data = ([X, fltr], y, val_mask) -model.fit([X, fltr], - y, - sample_weight=train_mask, +loader_tr = SingleLoader(dataset, sample_weights=mask_tr) +loader_va = SingleLoader(dataset, sample_weights=mask_va) +model.fit(loader_tr.tf(), + steps_per_epoch=loader_tr.steps_per_epoch, + validation_data=loader_va.tf(), + validation_steps=loader_va.steps_per_epoch, epochs=epochs, - batch_size=N, - validation_data=validation_data, - shuffle=False, # Shuffling data means shuffling the whole graph - callbacks=[ - EarlyStopping(patience=es_patience, restore_best_weights=True) - ]) + callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) # Evaluate model print('Evaluating model.') -eval_results = model.evaluate([X, fltr], - y, - sample_weight=test_mask, - batch_size=N) +loader_te = SingleLoader(dataset, sample_weights=mask_te) +eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/examples/node_prediction/citation_cheby.py b/examples/node_prediction/citation_cheby.py index ac97d070..f814289e 100644 --- a/examples/node_prediction/citation_cheby.py +++ b/examples/node_prediction/citation_cheby.py @@ -12,28 +12,27 @@ from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets import citation +from spektral.data.loaders import SingleLoader +from spektral.datasets.citation import Citation from spektral.layers import ChebConv +from spektral.transforms import LayerPreprocess, AdjToSpTensor # Load data -dataset = 'cora' -A, X, y, train_mask, val_mask, test_mask = citation.load_data(dataset) +dataset = Citation('cora', + transforms=[LayerPreprocess(ChebConv), AdjToSpTensor()]) +mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Parameters -channels = 16 # Number of channels in the first layer -K = 2 # Max degree of the Chebyshev polynomials -N = X.shape[0] # Number of nodes in the graph -F = X.shape[1] # Original size of node features -n_classes = y.shape[1] # Number of classes -dropout = 0.5 # Dropout rate for the features -l2_reg = 5e-4 / 2 # L2 regularization rate -learning_rate = 1e-2 # Learning rate -epochs = 200 # Number of training epochs -es_patience = 10 # Patience for early stopping - -# Preprocessing operations -fltr = ChebConv.preprocess(A).astype('f4') -X = X.toarray() +channels = 16 # Number of channels in the first layer +K = 2 # Max degree of the Chebyshev polynomials +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes +dropout = 0.5 # Dropout rate for the features +l2_reg = 5e-4 / 2 # L2 regularization rate +learning_rate = 1e-2 # 
Learning rate +epochs = 200 # Number of training epochs +patience = 10 # Patience for early stopping # Model definition X_in = Input(shape=(F, )) @@ -46,7 +45,7 @@ kernel_regularizer=l2(l2_reg), use_bias=False)([dropout_1, fltr_in]) dropout_2 = Dropout(dropout)(graph_conv_1) -graph_conv_2 = ChebConv(n_classes, +graph_conv_2 = ChebConv(n_out, K=K, activation='softmax', use_bias=False)([dropout_2, fltr_in]) @@ -60,24 +59,19 @@ model.summary() # Train model -validation_data = ([X, fltr], y, val_mask) -model.fit([X, fltr], - y, - sample_weight=train_mask, +loader_tr = SingleLoader(dataset, sample_weights=mask_tr) +loader_va = SingleLoader(dataset, sample_weights=mask_va) +model.fit(loader_tr.tf(), + steps_per_epoch=loader_tr.steps_per_epoch, + validation_data=loader_va.tf(), + validation_steps=loader_va.steps_per_epoch, epochs=epochs, - batch_size=N, - validation_data=validation_data, - shuffle=False, # Shuffling data means shuffling the whole graph - callbacks=[ - EarlyStopping(patience=es_patience, restore_best_weights=True) - ]) + callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) # Evaluate model print('Evaluating model.') -eval_results = model.evaluate([X, fltr], - y, - sample_weight=test_mask, - batch_size=N) +loader_te = SingleLoader(dataset, sample_weights=mask_te) +eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/examples/node_prediction/citation_gat.py b/examples/node_prediction/citation_gat.py index cf90107b..6eeeddf1 100644 --- a/examples/node_prediction/citation_gat.py +++ b/examples/node_prediction/citation_gat.py @@ -11,28 +11,28 @@ from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets import citation +from spektral.data.loaders import SingleLoader +from spektral.datasets.citation import Citation from spektral.layers import GraphAttention +from spektral.transforms import LayerPreprocess, AdjToSpTensor # Load data -dataset = 'cora' -A, X, y, train_mask, val_mask, test_mask = citation.load_data(dataset) +dataset = Citation('cora', + transforms=[LayerPreprocess(GraphAttention), AdjToSpTensor()]) +mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Parameters -channels = 8 # Number of channel in each head of the first GAT layer -n_attn_heads = 8 # Number of attention heads in first GAT layer -N = X.shape[0] # Number of nodes in the graph -F = X.shape[1] # Original size of node features -n_classes = y.shape[1] # Number of classes -dropout = 0.6 # Dropout rate for the features and adjacency matrix -l2_reg = 5e-6 # L2 regularization rate -learning_rate = 5e-3 # Learning rate -epochs = 20000 # Number of training epochs -es_patience = 100 # Patience for early stopping +channels = 8 # Number of channels in each head of the first GAT layer +n_attn_heads = 8 # Number of attention heads in first GAT layer +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes +dropout = 0.6 # Dropout rate for the features and adjacency matrix +l2_reg = 5e-6 # L2 regularization rate +learning_rate = 5e-3 # Learning rate +epochs = 20000 # Number of training epochs +patience = 100 # Patience for early stopping -# Preprocessing operations -A = A.astype('f4') -X = X.toarray() # Model definition X_in = Input(shape=(F, )) @@ -48,7 +48,7 @@ attn_kernel_regularizer=l2(l2_reg) )([dropout_1, A_in]) dropout_2 = 
Dropout(dropout)(graph_attention_1) -graph_attention_2 = GraphAttention(n_classes, +graph_attention_2 = GraphAttention(n_out, attn_heads=1, concat_heads=False, dropout_rate=dropout, @@ -66,24 +66,19 @@ model.summary() # Train model -validation_data = ([X, A], y, val_mask) -model.fit([X, A], - y, - sample_weight=train_mask, +loader_tr = SingleLoader(dataset, sample_weights=mask_tr) +loader_va = SingleLoader(dataset, sample_weights=mask_va) +model.fit(loader_tr.tf(), + steps_per_epoch=loader_tr.steps_per_epoch, + validation_data=loader_va.tf(), + validation_steps=loader_va.steps_per_epoch, epochs=epochs, - batch_size=N, - validation_data=validation_data, - shuffle=False, # Shuffling data means shuffling the whole graph - callbacks=[ - EarlyStopping(patience=es_patience, restore_best_weights=True) - ]) + callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) # Evaluate model print('Evaluating model.') -eval_results = model.evaluate([X, A], - y, - sample_weight=test_mask, - batch_size=N) +loader_te = SingleLoader(dataset, sample_weights=mask_te) +eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/examples/node_prediction/citation_gcn.py b/examples/node_prediction/citation_gcn.py index c262c205..ae358f28 100644 --- a/examples/node_prediction/citation_gcn.py +++ b/examples/node_prediction/citation_gcn.py @@ -11,27 +11,26 @@ from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets import citation +from spektral.data.loaders import SingleLoader +from spektral.datasets.citation import Citation from spektral.layers import GraphConv +from spektral.transforms import LayerPreprocess, AdjToSpTensor # Load data -dataset = 'cora' -A, X, y, train_mask, val_mask, test_mask = citation.load_data(dataset) +dataset = Citation('cora', + transforms=[LayerPreprocess(GraphConv), AdjToSpTensor()]) +mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Parameters -channels = 16 # Number of channels in the first layer -N = X.shape[0] # Number of nodes in the graph -F = X.shape[1] # Original size of node features -n_classes = y.shape[1] # Number of classes -dropout = 0.5 # Dropout rate for the features -l2_reg = 5e-4 / 2 # L2 regularization rate -learning_rate = 1e-2 # Learning rate -epochs = 200 # Number of training epochs -es_patience = 10 # Patience for early stopping - -# Preprocessing operations -fltr = GraphConv.preprocess(A).astype('f4') -X = X.toarray() +channels = 16 # Number of channels in the first layer +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes +dropout = 0.5 # Dropout rate for the features +l2_reg = 5e-4 / 2 # L2 regularization rate +learning_rate = 1e-2 # Learning rate +epochs = 200 # Number of training epochs +patience = 10 # Patience for early stopping # Model definition X_in = Input(shape=(F, )) @@ -43,7 +42,7 @@ kernel_regularizer=l2(l2_reg), use_bias=False)([dropout_1, fltr_in]) dropout_2 = Dropout(dropout)(graph_conv_1) -graph_conv_2 = GraphConv(n_classes, +graph_conv_2 = GraphConv(n_out, activation='softmax', use_bias=False)([dropout_2, fltr_in]) @@ -56,24 +55,19 @@ model.summary() # Train model -validation_data = ([X, fltr], y, val_mask) -model.fit([X, fltr], - y, - sample_weight=train_mask, +loader_tr = SingleLoader(dataset, sample_weights=mask_tr) +loader_va = SingleLoader(dataset, 
sample_weights=mask_va) +model.fit(loader_tr.tf(), + steps_per_epoch=loader_tr.steps_per_epoch, + validation_data=loader_va.tf(), + validation_steps=loader_va.steps_per_epoch, epochs=epochs, - batch_size=N, - validation_data=validation_data, - shuffle=False, # Shuffling data means shuffling the whole graph - callbacks=[ - EarlyStopping(patience=es_patience, restore_best_weights=True) - ]) + callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) # Evaluate model print('Evaluating model.') -eval_results = model.evaluate([X, fltr], - y, - sample_weight=test_mask, - batch_size=N) +loader_te = SingleLoader(dataset, sample_weights=mask_te) +eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/examples/node_prediction/citation_simple_gc.py b/examples/node_prediction/citation_simple_gc.py index 0058e251..6e85afb7 100644 --- a/examples/node_prediction/citation_simple_gc.py +++ b/examples/node_prediction/citation_simple_gc.py @@ -3,6 +3,10 @@ Simplifying Graph Convolutional Networks (https://arxiv.org/abs/1902.07153) Felix Wu, Tianyi Zhang, Amauri Holanda de Souza Jr., Christopher Fifty, Tao Yu, Kilian Q. Weinberger + +To implement it, we define a custom transform for the adjacency matrix. A +transform is simply a callable object that takes a Graph as input and returns +a Graph. """ from tensorflow.keras.callbacks import EarlyStopping @@ -11,37 +15,44 @@ from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets import citation +from spektral.data.loaders import SingleLoader +from spektral.datasets.citation import Citation from spektral.layers import GraphConv -from spektral.utils.convolution import gcn_filter +from spektral.transforms import LayerPreprocess, AdjToSpTensor -# Load data -dataset = 'cora' -A, X, y, train_mask, val_mask, test_mask = citation.load_data(dataset) -# Parameters -K = 2 # Degree of propagation -N = X.shape[0] # Number of nodes in the graph -F = X.shape[1] # Original size of node features -n_classes = y.shape[1] # Number of classes -l2_reg = 5e-6 # L2 regularization rate -learning_rate = 0.2 # Learning rate -epochs = 20000 # Number of training epochs -es_patience = 200 # Patience for early stopping +class SGCN: + def __init__(self, K): + self.K = K + + def __call__(self, graph): + out = graph.adj + for i in range(self.K - 1): + out = out.dot(out) + out.sort_indices() + graph.adj = out + return graph -# Preprocessing operations -fltr = gcn_filter(A).astype('f4') -X = X.toarray() -# Pre-compute propagation -for i in range(K - 1): - fltr = fltr.dot(fltr) -fltr.sort_indices() +# Load data +K = 2 # Propagation steps for SGCN +dataset = Citation('cora', + transforms=[LayerPreprocess(GraphConv), SGCN(K), AdjToSpTensor()]) +mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te + +# Parameters +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes +l2_reg = 5e-6 # L2 regularization rate +learning_rate = 0.2 # Learning rate +epochs = 20000 # Number of training epochs +patience = 200 # Patience for early stopping # Model definition X_in = Input(shape=(F, )) fltr_in = Input((N, ), sparse=True) -output = GraphConv(n_classes, +output = GraphConv(n_out, activation='softmax', kernel_regularizer=l2(l2_reg), use_bias=False)([X_in, fltr_in]) @@ -55,24 +66,19 @@ model.summary() # Train model 
-validation_data = ([X, fltr], y, val_mask) -model.fit([X, fltr], - y, - sample_weight=train_mask, +loader_tr = SingleLoader(dataset, sample_weights=mask_tr) +loader_va = SingleLoader(dataset, sample_weights=mask_va) +model.fit(loader_tr.tf(), + steps_per_epoch=loader_tr.steps_per_epoch, + validation_data=loader_va.tf(), + validation_steps=loader_va.steps_per_epoch, epochs=epochs, - batch_size=N, - validation_data=validation_data, - shuffle=False, # Shuffling data means shuffling the whole graph - callbacks=[ - EarlyStopping(patience=es_patience, restore_best_weights=True) - ]) + callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) # Evaluate model print('Evaluating model.') -eval_results = model.evaluate([X, fltr], - y, - sample_weight=test_mask, - batch_size=N) +loader_te = SingleLoader(dataset, sample_weights=mask_te) +eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 67301f6e..b609761c 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -52,6 +52,10 @@ def __init__(self, transforms=None, **kwargs): if len(self.graphs) == 0: raise ValueError('Datasets cannot be empty') + if len(self.graphs) == 1 or len(set([g.N for g in self.graphs])) == 1: + self.N = self.graphs[0].N + else: + self.N = None self.F = None self.S = None self.n_out = None @@ -65,9 +69,11 @@ def __init__(self, transforms=None, **kwargs): if transforms is not None: if not isinstance(transforms, (list, tuple)) and callable(transforms): transforms = [transforms] - else: + elif not all([callable(t) for t in transforms]): raise ValueError('transforms must be a list of callables or ' 'a callable.') + else: + pass for t in transforms: self.apply(t) diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index a7faf4e2..e65b74e4 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -51,6 +51,31 @@ def _pack(self, batch): return [list(elem) for elem in zip(*[g.numpy() for g in batch])] +class SingleLoader(Loader): + """ + A [Loader]() for single mode. + """ + def __init__(self, dataset, epochs=None, sample_weights=None): + self.sample_weights = sample_weights + super().__init__(dataset, batch_size=1, epochs=epochs, shuffle=False) + + def collate(self, batch): + graph = batch[0] + output = graph.numpy() + output = [output[:-1], output[-1]] + if self.sample_weights is not None: + output += [self.sample_weights] + return tuple(output) + + def tf(self): + graph = self.dataset[0] + tensors = [(graph.x, graph.adj), graph.y] + if self.sample_weights is not None: + tensors += [self.sample_weights] + tensors = tuple(tensors) + return tf.data.Dataset.from_tensors(tensors).repeat(self.epochs) + + class DisjointLoader(Loader): """ A [Loader](https://graphneural.network/) for disjoint mode. @@ -73,7 +98,7 @@ def tf(self): 'or greater.') signature = copy.deepcopy(self.dataset.signature) if 'y' in signature: - # Edge attributes have an extra None dimension in batch mode + # Targets have an extra None dimension in batch mode signature['y']['shape'] = prepend_none(signature['y']['shape']) if 'a' in signature: diff --git a/spektral/datasets/__init__.py b/spektral/datasets/__init__.py index c44da679..bc28a7f0 100644 --- a/spektral/datasets/__init__.py +++ b/spektral/datasets/__init__.py @@ -1,6 +1,6 @@ -from . import citation +from .citation import Citation from . import graphsage from . import mnist -from . 
import ogb -from . import qm9 -from . import tudataset +from .ogb import OGB +from .qm9 import QM9 +from .tudataset import TUDataset diff --git a/spektral/datasets/citation.py b/spektral/datasets/citation.py index c60cbfc6..29b1a62f 100644 --- a/spektral/datasets/citation.py +++ b/spektral/datasets/citation.py @@ -1,137 +1,133 @@ -""" -The MIT License - -Copyright (c) 2016 Thomas Kipf - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. - -This code was taken almost verbatim from https://github.com/tkipf/gcn/ and -adapted to work in Spektral. -""" import os +import os.path as osp import networkx as nx import numpy as np import requests import scipy.sparse as sp +from spektral.data import Dataset, Graph from spektral.utils.io import load_binary -DATA_URL = 'https://github.com/tkipf/gcn/raw/master/gcn/data/{}' -DATA_PATH = os.path.expanduser('~/.spektral/datasets/') -AVAILABLE_DATASETS = {'cora', 'citeseer', 'pubmed'} - -def load_data(dataset_name='cora', normalize_features=True, random_split=False): +class Citation(Dataset): """ - Loads a citation dataset (Cora, Citeseer or Pubmed) using the "Planetoid" - splits intialliy defined in [Yang et al. (2016)](https://arxiv.org/abs/1603.08861). - The train, test, and validation splits are given as binary masks. + The citation datasets Cora, Citeseer and Pubmed. Node attributes are bag-of-words vectors representing the most common words in the text document associated to each node. Two papers are connected if either one cites the other. Labels represent the class of the paper. + The train, test, and validation splits are given as binary masks and are + accessible with the `mask_tr`, `mask_va`, and `mask_te` respectively. + + **Arguments** - :param dataset_name: name of the dataset to load (`'cora'`, `'citeseer'`, or + - `name`: name of the dataset to load (`'cora'`, `'citeseer'`, or `'pubmed'`); - :param normalize_features: if True, the node features are normalized; - :param random_split: if True, return a randomized split (20 nodes per class + - `random_split`: if True, return a randomized split (20 nodes per class for training, 30 nodes per class for validation and the remaining nodes for - testing, [Shchur et al. (2018)](https://arxiv.org/abs/1811.05868)). - :return: - - Adjacency matrix; - - Node features; - - Labels; - - Three binary masks for train, validation, and test splits. + testing, as recommended by [Shchur et al. (2018)](https://arxiv.org/abs/1811.05868)). + If False (default), return the "Planetoid" public splits defined by + [Yang et al. 
(2016)](https://arxiv.org/abs/1603.08861). + - `normalize_x`: if True, normalize the features. """ - if dataset_name not in AVAILABLE_DATASETS: - raise ValueError('Available datasets: {}'.format(AVAILABLE_DATASETS)) - - if not os.path.exists(DATA_PATH + dataset_name): - _download_data(dataset_name) - - print('Loading {} dataset'.format(dataset_name)) - - names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] - objects = [] - data_path = os.path.join(DATA_PATH, dataset_name) - for n in names: - filename = os.path.join(data_path, 'ind.{}.{}'.format(dataset_name, n)) - objects.append(load_binary(filename)) - - x, y, tx, ty, allx, ally, graph = tuple(objects) - adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) - test_idx_reorder = _parse_index_file( - os.path.join(data_path, "ind.{}.test.index".format(dataset_name))) - test_idx_range = np.sort(test_idx_reorder) - - if dataset_name == 'citeseer': - test_idx_range_full = range(min(test_idx_reorder), - max(test_idx_reorder) + 1) - tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) - tx_extended[test_idx_range - min(test_idx_range), :] = tx - tx = tx_extended - ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) - ty_extended[test_idx_range - min(test_idx_range), :] = ty - ty = ty_extended - - features = sp.vstack((allx, tx)).tolil() - features[test_idx_reorder, :] = features[test_idx_range, :] - - # Row-normalize the features - if normalize_features: - print('Pre-processing node features') - features = _preprocess_features(features) - - labels = np.vstack((ally, ty)) - labels[test_idx_reorder, :] = labels[test_idx_range, :] - - # Data splits - if random_split: - from sklearn.model_selection import train_test_split - indices = np.arange(labels.shape[0]) - n_classes = labels.shape[1] - idx_train, idx_test, y_train, y_test = train_test_split( - indices, labels, train_size=20 * n_classes, stratify=labels) - idx_val, idx_test, y_val, y_test = train_test_split( - idx_test, y_test, train_size=30 * n_classes, stratify=y_test) + url = 'https://github.com/tkipf/gcn/raw/master/gcn/data/{}' + suffixes = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph', 'test.index'] + + def __init__(self, name, random_split=False, normalize_x=False, **kwargs): + self.name = name.lower() + if self.name not in self.available_datasets(): + raise ValueError('Unknown dataset {}. See Citation.available_datasets() ' + 'for a list of available datasets.') + self.random_split = random_split + self.normalize_x = normalize_x + self.mask_tr = self.mask_va = self.mask_te = None + super().__init__(**kwargs) + + @property + def path(self): + return osp.join(super(Citation, self).path, self.name) + + def read(self): + objects = [_read_file(self.path, self.name, s) for s in self.suffixes] + objects = [o.A if sp.issparse(o) else o for o in objects] + x, y, tx, ty, allx, ally, graph, idx_te = objects + + # Public Planetoid splits. 
This is the default + idx_tr = np.arange(y.shape[0]) + idx_va = np.arange(y.shape[0], y.shape[0] + 500) + idx_te = idx_te.astype(int) + idx_te_sort = np.sort(idx_te) + + # Fix disconnected nodes in Citeseer + if self.name == 'citeseer': + idx_te_len = idx_te.max() - idx_te.min() + 1 + tx_ext = np.zeros((idx_te_len, x.shape[1])) + tx_ext[idx_te_sort - idx_te.min(), :] = tx + tx = tx_ext + ty_ext = np.zeros((idx_te_len, y.shape[1])) + ty_ext[idx_te_sort - idx_te.min(), :] = ty + ty = ty_ext + + x = np.vstack((allx, tx)) + y = np.vstack((ally, ty)) + x[idx_te, :] = x[idx_te_sort, :] + y[idx_te, :] = y[idx_te_sort, :] + + # Row-normalize the features + if self.normalize_x: + print('Pre-processing node features') + x = _preprocess_features(x) + + if self.random_split: + # Throw away public splits and compute random ones like Shchur et al. + from sklearn.model_selection import train_test_split + indices = np.arange(y.shape[0]) + n_classes = y.shape[1] + idx_tr, idx_te, y_tr, y_te = train_test_split( + indices, y, train_size=20 * n_classes, stratify=y) + idx_va, idx_te, y_va, y_te = train_test_split( + idx_te, y_te, train_size=30 * n_classes, stratify=y_te) + + # Adjacency matrix + adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) + adj.setdiag(0) + adj.eliminate_zeros() + + # Train/valid/test masks + self.mask_tr = _idx_to_mask(idx_tr, y.shape[0]) + self.mask_va = _idx_to_mask(idx_va, y.shape[0]) + self.mask_te = _idx_to_mask(idx_te, y.shape[0]) + + return [Graph(x=x, adj=adj, y=y)] + + def download(self): + print('Downloading {} dataset.'.format(self.name)) + os.makedirs(self.path, exist_ok=True) + for n in self.suffixes: + f_name = 'ind.{}.{}'.format(self.name, n) + req = requests.get(self.url.format(f_name)) + if req.status_code == 404: + raise ValueError('Cannot download dataset ({} returned 404).' 
+ .format(self.url.format(f_name))) + with open(os.path.join(self.path, f_name), 'wb') as out_file: + out_file.write(req.content) + + @staticmethod + def available_datasets(): + return ['cora', 'citeseer', 'pubmed'] + + +def _read_file(path, name, suffix): + full_fname = os.path.join(path, 'ind.{}.{}'.format(name, suffix)) + if suffix == 'test.index': + return np.loadtxt(full_fname) else: - idx_test = test_idx_range.tolist() - idx_train = range(len(y)) - idx_val = range(len(y), len(y) + 500) - - train_mask = _sample_mask(idx_train, labels.shape[0]) - val_mask = _sample_mask(idx_val, labels.shape[0]) - test_mask = _sample_mask(idx_test, labels.shape[0]) - - return adj, features, labels, train_mask, val_mask, test_mask - - -def _parse_index_file(filename): - index = [] - for line in open(filename): - index.append(int(line.strip())) - return index + return load_binary(full_fname) -def _sample_mask(idx, l): +def _idx_to_mask(idx, l): mask = np.zeros(l) mask[idx] = 1 return np.array(mask, dtype=np.bool) @@ -144,17 +140,3 @@ def _preprocess_features(features): r_mat_inv = sp.diags(r_inv) features = r_mat_inv.dot(features) return features - - -def _download_data(dataset_name): - names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph', 'test.index'] - - os.makedirs(os.path.join(DATA_PATH, dataset_name)) - - print('Downloading', dataset_name, 'from', DATA_URL[:-2]) - for n in names: - f_name = 'ind.{}.{}'.format(dataset_name, n) - req = requests.get(DATA_URL.format(f_name)) - - with open(os.path.join(DATA_PATH, dataset_name, f_name), 'wb') as out_file: - out_file.write(req.content) diff --git a/spektral/transforms/__init__.py b/spektral/transforms/__init__.py new file mode 100644 index 00000000..4fa080fc --- /dev/null +++ b/spektral/transforms/__init__.py @@ -0,0 +1,5 @@ +from .adj_to_sp_tensor import AdjToSpTensor +from .degree import Degree, MaxDegree +from .gcn_filter import GCNFilter +from .layer_preprocess import LayerPreprocess +from .normalize_adj import NormalizeAdj \ No newline at end of file diff --git a/spektral/transforms/gcn_filter.py b/spektral/transforms/gcn_filter.py new file mode 100644 index 00000000..e2168425 --- /dev/null +++ b/spektral/transforms/gcn_filter.py @@ -0,0 +1,12 @@ +from spektral.utils import gcn_filter + + +class GCNFilter(object): + def __init__(self, symmetric=True): + self.symmetric = symmetric + + def __call__(self, graph): + if graph.adj is not None: + graph.adj = gcn_filter(graph.adj, self.symmetric) + + return graph diff --git a/spektral/transforms/normalize_adj.py b/spektral/transforms/normalize_adj.py new file mode 100644 index 00000000..308be920 --- /dev/null +++ b/spektral/transforms/normalize_adj.py @@ -0,0 +1,12 @@ +from spektral.utils import normalized_adjacency + + +class NormalizeAdj(object): + def __init__(self, symmetric=True): + self.symmetric = symmetric + + def __call__(self, graph): + if graph.adj is not None: + graph.adj = normalized_adjacency(graph.adj, self.symmetric) + + return graph diff --git a/tests/test_datasets.py b/tests/test_datasets.py index c5bc72a0..16e9c64f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -4,22 +4,10 @@ batch_size = 3 -def correctly_padded(adj, nf, ef): - assert adj.ndim == 3 - assert adj.shape[-1] == adj.shape[-2] - if nf is not None: - assert nf.ndim == 3 - assert adj.shape[-1] == nf.shape[-2] - if ef is not None: - assert ef.ndim == 4 - assert adj.shape[-1] == ef.shape[-2] - assert adj.shape[-1] == ef.shape[-3] - - def test_citation(): - for dataset_name in ['cora', 'citeseer', 'pubmed']: - 
citation.load_data(dataset_name) - citation.load_data(dataset_name, random_split=True) + dataset = citation.Citation('cora') + dataset = citation.Citation('citeseer', random_split=True) + dataset = citation.Citation('pubmed', normalize_x=True) def test_graphsage(): From 52083e563929c1c43d185132c1f53054e8b2168d Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Tue, 3 Nov 2020 18:47:46 +0100 Subject: [PATCH 12/57] Fix issues with loaders --- spektral/data/loaders.py | 9 +++---- tests/test_data/test_loaders.py | 42 +++++++++++++++++++++++++++++---- tests/test_data/test_utils.py | 9 +++++-- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index e65b74e4..8e3f9712 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -60,6 +60,7 @@ def __init__(self, dataset, epochs=None, sample_weights=None): super().__init__(dataset, batch_size=1, epochs=epochs, shuffle=False) def collate(self, batch): + # TODO how to deal with edge attrs graph = batch[0] output = graph.numpy() output = [output[:-1], output[-1]] @@ -68,12 +69,8 @@ def collate(self, batch): return tuple(output) def tf(self): - graph = self.dataset[0] - tensors = [(graph.x, graph.adj), graph.y] - if self.sample_weights is not None: - tensors += [self.sample_weights] - tensors = tuple(tensors) - return tf.data.Dataset.from_tensors(tensors).repeat(self.epochs) + output = self.collate(self.dataset) + return tf.data.Dataset.from_tensors(output).repeat(self.epochs) class DisjointLoader(Loader): diff --git a/tests/test_data/test_loaders.py b/tests/test_data/test_loaders.py index 4a360f80..d4576bb5 100644 --- a/tests/test_data/test_loaders.py +++ b/tests/test_data/test_loaders.py @@ -1,9 +1,10 @@ import numpy as np +import scipy.sparse as sp from spektral.data import DisjointLoader, BatchLoader from spektral.data.dataset import Dataset from spektral.data.graph import Graph -from spektral.data.loaders import PackedBatchLoader +from spektral.data.loaders import PackedBatchLoader, SingleLoader n_graphs = 10 ns = np.random.randint(3, 8, n_graphs) @@ -16,20 +17,51 @@ assert graphs_in_batch != 0 +class TestDatasetSingle(Dataset): + """ + A dataset with a single graph. 
+ """ + def read(self): + n = 10 + return [ + Graph(x=np.random.rand(n, f), + adj=sp.coo_matrix(np.random.randint(0, 2, (n, n))), + edge_attr=np.random.rand(n, n, s), + y=np.array(n * [[0., 1.]])) + ] + + class TestDataset(Dataset): + """ + A dataset with many graphs + """ def read(self): return [ Graph(x=np.random.rand(n, f), - adj=np.random.randint(0, 2, (n, n)), + adj=sp.coo_matrix(np.random.randint(0, 2, (n, n))), edge_attr=np.random.rand(n, n, s), y=np.array([0., 1.])) for n in ns ] +def test_single(): + data = TestDatasetSingle() + n = data.N + loader = SingleLoader(data, sample_weights=np.ones(n), epochs=1) + batches = [b for b in loader] + assert len(batches) == 1 + + (x, a, e), y, sw = batches[0] + assert x.shape == (n, f) + assert a.shape == (n, n) + assert len(e.shape) == 3 and e.shape[-1] == s # Avoid counting edges + assert y.shape == (n, 2) + + def test_disjoint(): data = TestDataset() - loader = DisjointLoader(data, batch_size=batch_size) + loader = DisjointLoader(data, batch_size=batch_size, epochs=1, shuffle=False) batches = [b for b in loader] (x, a, e, i), y = batches[-1] @@ -43,7 +75,7 @@ def test_disjoint(): def test_batch(): data = TestDataset() - loader = BatchLoader(data, batch_size=batch_size) + loader = BatchLoader(data, batch_size=batch_size, epochs=1, shuffle=False) batches = [b for b in loader] (x, a, e), y = batches[-1] @@ -56,7 +88,7 @@ def test_batch(): def test_fast_batch(): data = TestDataset() - loader = PackedBatchLoader(data, batch_size=batch_size) + loader = PackedBatchLoader(data, batch_size=batch_size, epochs=1, shuffle=False) batches = [b for b in loader] (x, a, e), y = batches[-1] diff --git a/tests/test_data/test_utils.py b/tests/test_data/test_utils.py index 2be165f4..d7721793 100644 --- a/tests/test_data/test_utils.py +++ b/tests/test_data/test_utils.py @@ -2,9 +2,14 @@ from spektral.data import Dataset, Graph from spektral.data.utils import to_disjoint, to_batch, batch_generator -from spektral.datasets import tudataset +import scipy.sparse as sp +import numpy as np -a_list, x_list, y = tudataset.load_data('ENZYMES', clean=True) +ns = np.random.randint(3, 10, 10) +f = 3 +a_list = [sp.coo_matrix(np.ones((n, n))) for n in ns] +x_list = [np.random.rand(n, f) for n in ns] +y = [[0, 1]] * len(ns) def test_to_batch(): From 007aac148914f1a85d6f57996f4409758d5b72d0 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 09:18:36 +0100 Subject: [PATCH 13/57] Fix qm9 link --- spektral/data/loaders.py | 1 - spektral/datasets/qm9.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index 8e3f9712..e8ced854 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -60,7 +60,6 @@ def __init__(self, dataset, epochs=None, sample_weights=None): super().__init__(dataset, batch_size=1, epochs=epochs, shuffle=False) def collate(self, batch): - # TODO how to deal with edge attrs graph = batch[0] output = graph.numpy() output = [output[:-1], output[-1]] diff --git a/spektral/datasets/qm9.py b/spektral/datasets/qm9.py index 6a7a2aa0..420066ae 100644 --- a/spektral/datasets/qm9.py +++ b/spektral/datasets/qm9.py @@ -40,7 +40,7 @@ class QM9(Dataset): - `amount`: int, load this many molecules instead of the full dataset (useful for debugging). 
""" - url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/gdb9.tar.gz' + url = 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb9.tar.gz' def __init__(self, amount=None, **kwargs): self.amount = amount From 5aad62d34082338e57855936ae570b02d3d8068b Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 11:52:59 +0100 Subject: [PATCH 14/57] Convert GraphSage datasets to Dataset interface --- spektral/datasets/citation.py | 2 +- spektral/datasets/graphsage.py | 370 ++++++++++++++------------------- tests/test_datasets.py | 5 +- 3 files changed, 160 insertions(+), 217 deletions(-) diff --git a/spektral/datasets/citation.py b/spektral/datasets/citation.py index 29b1a62f..c9500cae 100644 --- a/spektral/datasets/citation.py +++ b/spektral/datasets/citation.py @@ -17,7 +17,7 @@ class Citation(Dataset): Node attributes are bag-of-words vectors representing the most common words in the text document associated to each node. Two papers are connected if either one cites the other. - Labels represent the class of the paper. + Labels represent the subject area of the paper. The train, test, and validation splits are given as binary masks and are accessible with the `mask_tr`, `mask_va`, and `mask_te` respectively. diff --git a/spektral/datasets/graphsage.py b/spektral/datasets/graphsage.py index 8eb8d20f..ac2bde96 100644 --- a/spektral/datasets/graphsage.py +++ b/spektral/datasets/graphsage.py @@ -1,54 +1,7 @@ -""" -The MIT License - -Copyright (c) 2017 William L. Hamilton, Rex Ying - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. - -Portions of this code base were orginally forked from: https://github.com/tkipf/gcn, which is under the following License: - -Copyright (c) 2016 Thomas Kipf - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. - -Note from Spektral's authors: the code by Hamilton et al. was adapted and the -present version is not a verbatim copy. -""" - import json import os +import os.path as osp +import shutil import zipfile import numpy as np @@ -56,16 +9,17 @@ import scipy.sparse as sp from networkx.readwrite import json_graph -DATA_PATH = os.path.expanduser('~/.spektral/datasets/') -AVAILABLE_DATASETS = {'ppi', 'reddit'} +from spektral.data import Dataset, Graph +from spektral.data.dataset import DATASET_FOLDER -def load_data(dataset_name, max_degree=-1, normalize_features=True): +class GraphSage(Dataset): """ - Loads one of the datasets (PPI or Reddit) used in - [Hamilton & Ying (2017)](https://arxiv.org/abs/1706.02216). + The datasets used in the GraphSage paper + [(Hamilton & Ying (2017))](https://arxiv.org/abs/1706.02216): PPI and Reddit. - The PPI dataset (originally [Stark et al. (2006)](https://www.ncbi.nlm.nih.gov/pubmed/16381927)) + The PPI dataset (originally + [Stark et al. (2006)](https://www.ncbi.nlm.nih.gov/pubmed/16381927)) for inductive node classification uses positional gene sets, motif gene sets and immunological signatures as features and gene ontology sets as labels. @@ -76,171 +30,161 @@ def load_data(dataset_name, max_degree=-1, normalize_features=True): are obtained by concatenating the average GloVe CommonCrawl vectors of the title and comments, the post's score and the number of comments. - The train, test, and validation splits are returned as binary masks. - - :param dataset_name: name of the dataset to load (`'ppi'`, or `'reddit'`); - :param max_degree: int, if positive, subsample edges so that each node has - the specified maximum degree. - :param normalize_features: if True, the node features are normalized; - :return: - - Adjacency matrix; - - Node features; - - Labels; - - Three binary masks for train, validation, and test splits. - """ - prefix = DATA_PATH + dataset_name + '/' + dataset_name - if max_degree == -1: - npz_file = prefix + '.npz' - else: - npz_file = '{}_deg{}.npz'.format(prefix, max_degree) + The train, test, and validation splits are given as binary masks and are + accessible with the `mask_tr`, `mask_va`, and `mask_te` respectively. - if not os.path.exists(prefix + "-G.json"): - _download_data(dataset_name) + **Arguments** - if os.path.exists(npz_file): - # Data already prepreccesed - print('Loading pre-processed dataset {}.'.format(npz_file)) + - `name`: name of the dataset to load (`'ppi'`, or `'reddit'`); + """ + # TODO normalize features? + # # # Z-score on features (optional) + # if normalize_features: + # from sklearn.preprocessing import StandardScaler + # train_ids = np.array([id_map[n] for n in G.nodes() + # if not G.nodes[n]['val'] and not G.nodes[n]['test']]) + # x_tr = x[train_ids] + # scaler = StandardScaler() + # scaler.fit(x_tr) + # x = scaler.transform(x) + + url = 'http://snap.stanford.edu/graphsage/{}.zip' + + def __init__(self, name, **kwargs): + if name.lower() not in self.available_datasets(): + raise ValueError('Unknown dataset: {}. 
Possible: {}' + .format(name, self.available_datasets())) + self.name = name.lower() + self.mask_tr = self.mask_va = self.mask_te = None + super().__init__(**kwargs) + + @property + def path(self): + return osp.join(DATASET_FOLDER, 'GraphSage', self.name) + + def read(self): + npz_file = osp.join(self.path, self.name) + '.npz' data = np.load(npz_file) - feats = data['feats'] - labels = data['labels'] - train_mask = data['train_mask'] - val_mask = data['val_mask'] - test_mask = data['test_mask'] - full_adj = sp.csr_matrix( - (data['full_adj_data'], data['full_adj_indices'], data['full_adj_indptr']), - shape=data['full_adj_shape'] - ) - else: - # Preprocess data - print('Loading dataset.') - G_data = json.load(open(prefix + "-G.json")) - G = json_graph.node_link_graph(G_data) - feats = np.load(prefix + "-feats.npy").astype(np.float32) - id_map = json.load(open(prefix + "-id_map.json")) - if list(id_map.keys())[0].isdigit(): - conversion = lambda n: int(n) - else: - conversion = lambda n: n - id_map = {conversion(k): int(v) for k, v in id_map.items()} - class_map = json.load(open(prefix + "-class_map.json")) - if isinstance(list(class_map.values())[0], list): - lab_conversion = lambda n: n - else: - lab_conversion = lambda n: int(n) - - class_map = {conversion(k): lab_conversion(v) for k, v in class_map.items()} - - # Remove all nodes that do not have val/test annotations - # (necessary because of networkx weirdness with the Reddit data) - broken_count = 0 - to_remove = [] - for node in G.nodes(): - if node not in id_map: - to_remove.append(node) - broken_count += 1 - for node in to_remove: - G.remove_node(node) - print( - "Removed {:d} nodes that lacked proper annotations due to networkx versioning issues" - .format(broken_count) + x = data['x'] + adj = sp.coo_matrix( + (data['adj_data'], (data['adj_row'], data['adj_col'])), + shape=data['adj_shape'] ) + y = data['y'] + self.mask_tr = data['mask_tr'] + self.mask_va = data['mask_va'] + self.mask_te = data['mask_te'] + + return [Graph(x=x, adj=adj, y=y)] + + def download(self): + print('Dowloading', self.name, 'dataset.') + url = self.url.format(self.name) + req = requests.get(url) + os.makedirs(self.path, exist_ok=True) + + fname = osp.join(self.path, self.name + '.zip') + with open(fname, 'wb') as of: + of.write(req.content) + with zipfile.ZipFile(fname, 'r') as of: + of.extractall(self.path) + + # Datasets are zipped in a folder: unpack them + parent = self.path + subfolder = osp.join(self.path, self.name) + for filename in os.listdir(subfolder): + shutil.move(osp.join(subfolder, filename), osp.join(parent, filename)) + os.rmdir(subfolder) + + x, adj, y, mask_tr, mask_va, mask_te = preprocess_data(self.path, self.name) + + # Save pre-processed data + npz_file = osp.join(self.path, self.name) + '.npz' + np.savez(npz_file, x=x, adj_data=adj.data, adj_row=adj.row, + adj_col=adj.col, adj_shape=adj.shape, y=y, + mask_tr=mask_tr, mask_va=mask_va, mask_te=mask_te) + + @staticmethod + def available_datasets(): + return ['ppi', 'reddit'] + + +class PPI(GraphSage): + """ + Alias for `GraphSage('ppi')`. + """ + def __init__(self, **kwargs): + super().__init__(name='ppi', **kwargs) + + +class Reddit(GraphSage): + """ + Alias for `GraphSage('reddit')`. 
+ """ + def __init__(self, **kwargs): + super().__init__(name='reddit', **kwargs) + + +def preprocess_data(path, name): + """ + Code adapted from https://github.com/williamleif/GraphSAGE + """ + print('Processing dataset.') + prefix = osp.join(path, name) + + G_data = json.load(open(prefix + "-G.json")) + G = json_graph.node_link_graph(G_data) + + x = np.load(prefix + "-feats.npy").astype(np.float32) - # Construct adjacency matrix - edges = [] - for edge in G.edges(): - if edge[0] in id_map and edge[1] in id_map: - edges.append((id_map[edge[0]], id_map[edge[1]])) - print('{} edges'.format(len(edges))) - num_data = len(id_map) - - # Subsample edges (optional) - if max_degree > -1: - print('Subsampling edges.') - edges = _subsample_edges(edges, num_data, max_degree) - - # Get train/val/test indexes - val_data = np.array([id_map[n] for n in G.nodes() - if G.nodes[n]['val']], dtype=np.int32) - test_data = np.array([id_map[n] for n in G.nodes() - if G.nodes[n]['test']], dtype=np.int32) - train_mask = np.ones((num_data), dtype=np.bool) - train_mask[val_data] = False - train_mask[test_data] = False - val_mask = np.zeros((num_data), dtype=np.bool) - val_mask[val_data] = True - test_mask = np.zeros((num_data), dtype=np.bool) - test_mask[test_data] = True - - edges = np.array(edges, dtype=np.int32) - - def _get_adj(edges): - adj = sp.csr_matrix((np.ones((edges.shape[0]), dtype=np.float32), - (edges[:, 0], edges[:, 1])), shape=(num_data, num_data)) - adj = adj.maximum(adj.transpose()) - return adj - - full_adj = _get_adj(edges) - - # Z-score on features (optional) - if normalize_features: - from sklearn.preprocessing import StandardScaler - train_ids = np.array([id_map[n] for n in G.nodes() - if not G.nodes[n]['val'] and not G.nodes[n]['test']]) - train_feats = feats[train_ids] - scaler = StandardScaler() - scaler.fit(train_feats) - feats = scaler.transform(feats) - - # Process labels - if isinstance(list(class_map.values())[0], list): - num_classes = len(list(class_map.values())[0]) - labels = np.zeros((num_data, num_classes), dtype=np.float32) - for k in class_map.keys(): - labels[id_map[k], :] = np.array(class_map[k]) - else: - num_classes = len(set(class_map.values())) - labels = np.zeros((num_data, num_classes), dtype=np.float32) - for k in class_map.keys(): - labels[id_map[k], class_map[k]] = 1 - - with open(npz_file, 'wb') as fwrite: - print('Saving {} edges'.format(full_adj.nnz)) - np.savez(fwrite, num_data=num_data, - full_adj_data=full_adj.data, full_adj_indices=full_adj.indices, full_adj_indptr=full_adj.indptr, - full_adj_shape=full_adj.shape, - feats=feats, - labels=labels, - train_mask=train_mask, val_mask=val_mask, test_mask=test_mask) - - return full_adj, feats, labels, train_mask, val_mask, test_mask - - -def _download_data(dataset_name): - print('Dowloading ' + dataset_name + ' dataset.') - if dataset_name == 'ppi': - data_url = 'http://snap.stanford.edu/graphsage/ppi.zip' - elif dataset_name == 'reddit': - data_url = 'http://snap.stanford.edu/graphsage/reddit.zip' + id_map = json.load(open(prefix + "-id_map.json")) + if list(id_map.keys())[0].isdigit(): + conversion = lambda n: int(n) else: - raise ValueError('dataset_name must be one of: {}'.format(AVAILABLE_DATASETS)) - req = requests.get(data_url) + conversion = lambda n: n + id_map = {conversion(k): int(v) for k, v in id_map.items()} + n = len(id_map) - os.makedirs(DATA_PATH, exist_ok=True) - with open(DATA_PATH + dataset_name + '.zip', 'wb') as out_file: - out_file.write(req.content) - with zipfile.ZipFile(DATA_PATH + 
dataset_name + '.zip', 'r') as zip_ref: - zip_ref.extractall(DATA_PATH) + class_map = json.load(open(prefix + "-class_map.json")) + if isinstance(list(class_map.values())[0], list): + lab_conversion = lambda n: n + else: + lab_conversion = lambda n: int(n) + class_map = {conversion(k): lab_conversion(v) for k, v in class_map.items()} + # Remove all nodes that do not have val/test annotations + [G.remove_node(node) for node in G.nodes() if node not in id_map] -def _subsample_edges(edges, num_data, max_degree): + # Adjacency matrix + edges = [(id_map[edge[0]], id_map[edge[1]]) + for edge in G.edges() + if edge[0] in id_map and edge[1] in id_map] edges = np.array(edges, dtype=np.int32) - np.random.shuffle(edges) - degree = np.zeros(num_data, dtype=np.int32) - - new_edges = [] - for e in edges: - if degree[e[0]] < max_degree and degree[e[1]] < max_degree: - new_edges.append((e[0], e[1])) - degree[e[0]] += 1 - degree[e[1]] += 1 - return new_edges + adj = sp.coo_matrix((np.ones((edges.shape[0]), dtype=np.float32), + (edges[:, 0], edges[:, 1])), shape=(n, n)) + adj = adj.maximum(adj.transpose()).tocoo() + + # Process labels + if isinstance(list(class_map.values())[0], list): + num_classes = len(list(class_map.values())[0]) + y = np.zeros((n, num_classes), dtype=np.float32) + for k in class_map.keys(): + y[id_map[k], :] = np.array(class_map[k]) + else: + num_classes = len(set(class_map.values())) + y = np.zeros((n, num_classes), dtype=np.float32) + for k in class_map.keys(): + y[id_map[k], class_map[k]] = 1 + + # Get train/val/test indexes + idx_va = np.array([id_map[n] for n in G.nodes() if G.nodes[n]['val']], dtype=np.int32) + idx_te = np.array([id_map[n] for n in G.nodes() if G.nodes[n]['test']], dtype=np.int32) + mask_tr = np.ones(n, dtype=np.bool) + mask_va = np.zeros(n, dtype=np.bool) + mask_te = np.zeros(n, dtype=np.bool) + mask_tr[idx_va] = False + mask_tr[idx_te] = False + mask_va[idx_va] = True + mask_te[idx_te] = True + + return x, adj, y, mask_tr, mask_va, mask_te diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 16e9c64f..723b7f5f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -11,9 +11,8 @@ def test_citation(): def test_graphsage(): - for dataset_name in ['ppi']: - # Test only PPI because Travis otherwise fails - graphsage.load_data(dataset_name) + # Test only PPI because Travis otherwise runs into memory errors + dataset = graphsage.PPI() def test_mnist(): From 3237debe40fc748c9dcd06ea053abc888f4a26da Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 11:59:31 +0100 Subject: [PATCH 15/57] Update citation_gat_fast.py example to work well in GH workflow --- examples/node_prediction/citation_gat_fast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/node_prediction/citation_gat_fast.py b/examples/node_prediction/citation_gat_fast.py index 0c7b15a1..aa8c86f0 100644 --- a/examples/node_prediction/citation_gat_fast.py +++ b/examples/node_prediction/citation_gat_fast.py @@ -81,8 +81,9 @@ def evaluate(): best_val_loss = 99999 best_test_acc = 0 current_patience = patience = 100 +epochs = 999999 tic() -for epoch in range(1, 99999): +for epoch in range(1, epochs): train() l, a = evaluate() print('Loss tr: {:.4f}, Acc tr: {:.4f}, ' From 8399fce67038569dea8833f79032dfd307f5872b Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 12:45:26 +0100 Subject: [PATCH 16/57] Fix citation_*_fast.py examples --- .github/workflows/examples.yml | 2 +- examples/node_prediction/citation_gat_fast.py | 39 
++++++++++--------- examples/node_prediction/citation_gcn_fast.py | 30 +++++++------- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index f4e6aeec..64f78d2e 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -38,7 +38,7 @@ jobs: cd examples/node_prediction/ for c_py in citation_*.py; do echo "##### $c_py #####" - python $c_py + python $c_py done cd .. cd graph_prediction/ diff --git a/examples/node_prediction/citation_gat_fast.py b/examples/node_prediction/citation_gat_fast.py index aa8c86f0..5d8692c0 100644 --- a/examples/node_prediction/citation_gat_fast.py +++ b/examples/node_prediction/citation_gat_fast.py @@ -12,41 +12,42 @@ from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets import citation +from spektral.datasets.citation import Citation from spektral.layers import GraphAttention -from spektral.layers import ops +from spektral.transforms import LayerPreprocess, AdjToSpTensor from spektral.utils import tic, toc # Load data -A, X, y, train_mask, val_mask, test_mask = citation.load_data('cora') -fltr = A.astype('f4') -fltr = ops.sp_matrix_to_sp_tensor(fltr) -X = X.toarray() +dataset = Citation('cora', + transforms=[LayerPreprocess(GraphAttention), AdjToSpTensor()]) +graph = dataset[0] +x, a, y = graph.x, graph.adj, graph.y +mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Define model -X_in = Input(shape=(X.shape[1], )) -fltr_in = Input(shape=(X.shape[0], ), sparse=True) -X_1 = Dropout(0.6)(X_in) -X_1 = GraphAttention(8, +x_in = Input(shape=(dataset.F,)) +a_in = Input(shape=(None,), sparse=True) +x_1 = Dropout(0.6)(x_in) +x_1 = GraphAttention(8, attn_heads=8, concat_heads=True, dropout_rate=0.6, activation='elu', kernel_regularizer=l2(5e-4), attn_kernel_regularizer=l2(5e-4), - bias_regularizer=l2(5e-4))([X_1, fltr_in]) -X_2 = Dropout(0.6)(X_1) -X_2 = GraphAttention(y.shape[1], + bias_regularizer=l2(5e-4))([x_1, a_in]) +x_2 = Dropout(0.6)(x_1) +x_2 = GraphAttention(dataset.n_out, attn_heads=1, concat_heads=True, dropout_rate=0.6, activation='softmax', kernel_regularizer=l2(5e-4), attn_kernel_regularizer=l2(5e-4), - bias_regularizer=l2(5e-4))([X_2, fltr_in]) + bias_regularizer=l2(5e-4))([x_2, a_in]) # Build model -model = Model(inputs=[X_in, fltr_in], outputs=X_2) +model = Model(inputs=[x_in, a_in], outputs=x_2) optimizer = Adam(lr=5e-3) loss_fn = CategoricalCrossentropy() acc_fn = CategoricalAccuracy() @@ -56,8 +57,8 @@ @tf.function def train(): with tf.GradientTape() as tape: - predictions = model([X, fltr], training=True) - loss = loss_fn(y[train_mask], predictions[train_mask]) + predictions = model([x, a], training=True) + loss = loss_fn(y[mask_tr], predictions[mask_tr]) loss += sum(model.losses) gradients = tape.gradient(loss, model.trainable_variables) optimizer.apply_gradients(zip(gradients, model.trainable_variables)) @@ -66,10 +67,10 @@ def train(): @tf.function def evaluate(): - predictions = model([X, fltr], training=False) + predictions = model([x, a], training=False) losses = [] accuracies = [] - for mask in [train_mask, val_mask, test_mask]: + for mask in [mask_tr, mask_va, mask_te]: loss = loss_fn(y[mask], predictions[mask]) loss += sum(model.losses) losses.append(loss) diff --git a/examples/node_prediction/citation_gcn_fast.py b/examples/node_prediction/citation_gcn_fast.py index b6c3e8ba..ba5780c8 100644 --- a/examples/node_prediction/citation_gcn_fast.py +++ 
b/examples/node_prediction/citation_gcn_fast.py @@ -12,25 +12,27 @@ from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets import citation -from spektral.layers import GraphConv, ops +from spektral.datasets.citation import Citation +from spektral.layers import GraphConv +from spektral.transforms import LayerPreprocess, AdjToSpTensor from spektral.utils import tic, toc # Load data -A, X, y, train_mask, val_mask, test_mask = citation.load_data('cora') -fltr = GraphConv.preprocess(A).astype('f4') -fltr = ops.sp_matrix_to_sp_tensor(fltr) -X = X.toarray() +dataset = Citation('cora', + transforms=[LayerPreprocess(GraphConv), AdjToSpTensor()]) +graph = dataset[0] +x, a, y = graph.x, graph.adj, graph.y +mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Define model -X_in = Input(shape=(X.shape[1],)) -fltr_in = Input((X.shape[0],), sparse=True) -X_1 = GraphConv(16, 'relu', True, kernel_regularizer=l2(5e-4))([X_in, fltr_in]) -X_1 = Dropout(0.5)(X_1) -X_2 = GraphConv(y.shape[1], 'softmax', True)([X_1, fltr_in]) +x_in = Input(shape=(dataset.F,)) +a_in = Input((dataset.F,), sparse=True) +x_1 = GraphConv(16, 'relu', True, kernel_regularizer=l2(5e-4))([x_in, a_in]) +x_1 = Dropout(0.5)(x_1) +x_2 = GraphConv(y.shape[1], 'softmax', True)([x_1, a_in]) # Build model -model = Model(inputs=[X_in, fltr_in], outputs=X_2) +model = Model(inputs=[x_in, a_in], outputs=x_2) optimizer = Adam(lr=1e-2) loss_fn = CategoricalCrossentropy() @@ -39,8 +41,8 @@ @tf.function def train(): with tf.GradientTape() as tape: - predictions = model([X, fltr], training=True) - loss = loss_fn(y[train_mask], predictions[train_mask]) + predictions = model([x, a], training=True) + loss = loss_fn(y[mask_tr], predictions[mask_tr]) loss += sum(model.losses) gradients = tape.gradient(loss, model.trainable_variables) optimizer.apply_gradients(zip(gradients, model.trainable_variables)) From 8d45fdb0eeece6f1cb10301498d1da1248373113 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 12:53:39 +0100 Subject: [PATCH 17/57] Add aliases for citation datasets --- examples/node_prediction/citation_gat_fast.py | 5 ++-- examples/node_prediction/citation_gcn_fast.py | 8 ++--- spektral/datasets/citation.py | 30 ++++++++++++++++++- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/examples/node_prediction/citation_gat_fast.py b/examples/node_prediction/citation_gat_fast.py index 5d8692c0..b9021bf4 100644 --- a/examples/node_prediction/citation_gat_fast.py +++ b/examples/node_prediction/citation_gat_fast.py @@ -12,14 +12,13 @@ from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets.citation import Citation +from spektral.datasets.citation import Cora from spektral.layers import GraphAttention from spektral.transforms import LayerPreprocess, AdjToSpTensor from spektral.utils import tic, toc # Load data -dataset = Citation('cora', - transforms=[LayerPreprocess(GraphAttention), AdjToSpTensor()]) +dataset = Cora(transforms=[LayerPreprocess(GraphAttention), AdjToSpTensor()]) graph = dataset[0] x, a, y = graph.x, graph.adj, graph.y mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te diff --git a/examples/node_prediction/citation_gcn_fast.py b/examples/node_prediction/citation_gcn_fast.py index ba5780c8..b64047be 100644 --- a/examples/node_prediction/citation_gcn_fast.py +++ b/examples/node_prediction/citation_gcn_fast.py @@ -2,8 +2,7 @@ This script is a proof of 
concept to train GCN as fast as possible and with as little lines of code as possible. It uses a custom training function instead of the standard Keras fit(), and -can train GCN for 200 epochs in a few tenths of a second (0.32s on a GTX 1050). -In total, this script has 34 SLOC. +can train GCN for 200 epochs in a few tenths of a second (~0.20 on a GTX 1050). """ import tensorflow as tf from tensorflow.keras.layers import Input, Dropout @@ -12,14 +11,13 @@ from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets.citation import Citation +from spektral.datasets.citation import Cora from spektral.layers import GraphConv from spektral.transforms import LayerPreprocess, AdjToSpTensor from spektral.utils import tic, toc # Load data -dataset = Citation('cora', - transforms=[LayerPreprocess(GraphConv), AdjToSpTensor()]) +dataset = Cora(transforms=[LayerPreprocess(GraphConv), AdjToSpTensor()]) graph = dataset[0] x, a, y = graph.x, graph.adj, graph.y mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te diff --git a/spektral/datasets/citation.py b/spektral/datasets/citation.py index c9500cae..14f1806e 100644 --- a/spektral/datasets/citation.py +++ b/spektral/datasets/citation.py @@ -7,6 +7,7 @@ import scipy.sparse as sp from spektral.data import Dataset, Graph +from spektral.datasets.utils import DATASET_FOLDER from spektral.utils.io import load_binary @@ -47,7 +48,7 @@ def __init__(self, name, random_split=False, normalize_x=False, **kwargs): @property def path(self): - return osp.join(super(Citation, self).path, self.name) + return osp.join(DATASET_FOLDER, 'Citation', self.name) def read(self): objects = [_read_file(self.path, self.name, s) for s in self.suffixes] @@ -119,6 +120,33 @@ def available_datasets(): return ['cora', 'citeseer', 'pubmed'] +class Cora(Citation): + """ + Alias for `Citation('cora')`. + """ + def __init__(self, random_split=False, normalize_x=False, **kwargs): + super().__init__('cora', random_split=random_split, + normalize_x=normalize_x, **kwargs) + + +class Citeseer(Citation): + """ + Alias for `Citation('citeseer')`. + """ + def __init__(self, random_split=False, normalize_x=False, **kwargs): + super().__init__('citeseer', random_split=random_split, + normalize_x=normalize_x, **kwargs) + + +class Pubmed(Citation): + """ + Alias for `Citation('pubmed')`. 
+ """ + def __init__(self, random_split=False, normalize_x=False, **kwargs): + super().__init__('pubmed', random_split=random_split, + normalize_x=normalize_x, **kwargs) + + def _read_file(path, name, suffix): full_fname = os.path.join(path, 'ind.{}.{}'.format(name, suffix)) if suffix == 'test.index': From 12de0e9a152a52fe219439d55865f3dc5d1f7e80 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 12:55:17 +0100 Subject: [PATCH 18/57] Update tests --- tests/test_datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 723b7f5f..69a6b855 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -5,9 +5,9 @@ def test_citation(): - dataset = citation.Citation('cora') - dataset = citation.Citation('citeseer', random_split=True) - dataset = citation.Citation('pubmed', normalize_x=True) + dataset = citation.Cora() + dataset = citation.Citeseer(random_split=True) + dataset = citation.Pubmed(normalize_x=True) def test_graphsage(): From 14e33aa8f81e12e49e1acff6794c4387e5df70d6 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 13:11:28 +0100 Subject: [PATCH 19/57] Fix issue in citation_gat_fast.py --- examples/node_prediction/citation_gat_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/node_prediction/citation_gat_fast.py b/examples/node_prediction/citation_gat_fast.py index b9021bf4..6642efb3 100644 --- a/examples/node_prediction/citation_gat_fast.py +++ b/examples/node_prediction/citation_gat_fast.py @@ -83,7 +83,7 @@ def evaluate(): current_patience = patience = 100 epochs = 999999 tic() -for epoch in range(1, epochs): +for epoch in range(1, epochs + 1): train() l, a = evaluate() print('Loss tr: {:.4f}, Acc tr: {:.4f}, ' From 46d972472076ace339cd8e60f6cfd293bf55ab24 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 13:26:45 +0100 Subject: [PATCH 20/57] Fix node_clustering_mincut.py Run all examples in GH workflow --- .github/workflows/examples.yml | 18 ++++++------ examples/other/node_clustering_mincut.py | 37 ++++++++++++------------ spektral/data/dataset.py | 6 ++-- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index 64f78d2e..ec6ddbd0 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -36,20 +36,20 @@ jobs: - name: Run all examples run: | cd examples/node_prediction/ - for c_py in citation_*.py; do - echo "##### $c_py #####" - python $c_py + for f in *.py; do + echo "##### $f #####" + python $f done cd .. cd graph_prediction/ - for c_py in qm9_*.py; do - echo "##### $c_py #####" - python $c_py + for f in *.py; do + echo "##### $f #####" + python $f done cd .. cd other/ - for c_py in *.py; do - echo "##### $c_py #####" - python $c_py + for f in *.py; do + echo "##### $f #####" + python $f done cd .. 
diff --git a/examples/other/node_clustering_mincut.py b/examples/other/node_clustering_mincut.py index 761fc618..a259504b 100644 --- a/examples/other/node_clustering_mincut.py +++ b/examples/other/node_clustering_mincut.py @@ -14,7 +14,7 @@ from tensorflow.keras.models import Model from tqdm import tqdm -from spektral.datasets import citation +from spektral.datasets.citation import Cora from spektral.layers.convolutional import GraphConvSkip from spektral.layers.ops import sp_matrix_to_sp_tensor from spektral.layers.pooling import MinCutPool @@ -38,29 +38,30 @@ def train_step(inputs): ################################################################################ # LOAD DATASET ################################################################################ -A, X, y, _, _, _ = citation.load_data('cora') -A_norm = normalized_adjacency(A) -X = X.todense() -F = X.shape[-1] +dataset = Cora() +adj, x, y = dataset[0].adj, dataset[0].x, dataset[0].y +a_norm = normalized_adjacency(adj) +a_norm = sp_matrix_to_sp_tensor(a_norm) +F = dataset.F y = np.argmax(y, axis=-1) n_clusters = y.max() + 1 ################################################################################ # MODEL ################################################################################ -X_in = Input(shape=(F,), name='X_in') -A_in = Input(shape=(None, ), name='A_in', sparse=True) +x_in = Input(shape=(F,), name='X_in') +a_in = Input(shape=(None,), name='A_in', sparse=True) -X_1 = GraphConvSkip(16, activation='elu')([X_in, A_in]) -X_1, A_1, S = MinCutPool(n_clusters, return_mask=True)([X_1, A_in]) +x_1 = GraphConvSkip(16, activation='elu')([x_in, a_in]) +x_1, a_1, s_1 = MinCutPool(n_clusters, return_mask=True)([x_1, a_in]) -model = Model([X_in, A_in], [X_1, S]) +model = Model([x_in, a_in], [x_1, s_1]) ################################################################################ # TRAINING ################################################################################ # Setup -inputs = [X, sp_matrix_to_sp_tensor(A_norm)] +inputs = [x, a_norm] opt = tf.keras.optimizers.Adam(learning_rate=lr) # Fit model @@ -70,18 +71,18 @@ def train_step(inputs): outs = train_step(inputs) outs = [o.numpy() for o in outs] loss_history.append((outs[0], outs[1], (outs[0] + outs[1]))) - s = np.argmax(outs[2], axis=-1) - nmi_history.append(v_measure_score(y, s)) + s_out = np.argmax(outs[2], axis=-1) + nmi_history.append(v_measure_score(y, s_out)) loss_history = np.array(loss_history) ################################################################################ # RESULTS ################################################################################ -_, S_ = model(inputs, training=False) -s = np.argmax(S_, axis=-1) -hom = homogeneity_score(y, s) -com = completeness_score(y, s) -nmi = v_measure_score(y, s) +_, s_out = model(inputs, training=False) +s_out = np.argmax(s_out, axis=-1) +hom = homogeneity_score(y, s_out) +com = completeness_score(y, s_out) +nmi = v_measure_score(y, s_out) print('Homogeneity: {:.3f}; Completeness: {:.3f}; NMI: {:.3f}'.format(hom, com, nmi)) # Plots diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index b609761c..1ea98b61 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -36,7 +36,7 @@ def read(self): ``` >>> dataset.F - >>> dataset.S + >>> dataset.s_1 >>> dataset.n_out ``` @@ -115,9 +115,9 @@ def _signature(self): if graph.edge_attr is not None: signature['e'] = dict() signature['e']['spec'] = get_spec(graph.edge_attr) - signature['e']['shape'] = (None, graph.S) + 
signature['e']['shape'] = (None, graph.s_1) signature['e']['dtype'] = tf.as_dtype(graph.edge_attr.dtype) - self.S = graph.S + self.S = graph.s_1 if graph.y is not None: signature['y'] = dict() signature['y']['spec'] = get_spec(graph.y) From 1f8dc56bfe3700700b05b4cd18549306324b92e0 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 14:26:38 +0100 Subject: [PATCH 21/57] Fix OGB loader for single-mode case Update ogbn-arxiv_gcn.py example Remove any explicit access to .row/.col attributes of sparse COO matrices in favor of sp.find --- examples/node_prediction/citation_gcn.py | 6 +- examples/node_prediction/ogbn-arxiv_gcn.py | 90 ++++++++++------------ spektral/data/dataset.py | 6 +- spektral/data/utils.py | 2 +- spektral/datasets/ogb.py | 6 +- spektral/layers/ops/sparse.py | 18 ++--- 6 files changed, 60 insertions(+), 68 deletions(-) diff --git a/examples/node_prediction/citation_gcn.py b/examples/node_prediction/citation_gcn.py index ae358f28..75eaa1d6 100644 --- a/examples/node_prediction/citation_gcn.py +++ b/examples/node_prediction/citation_gcn.py @@ -23,14 +23,14 @@ # Parameters channels = 16 # Number of channels in the first layer -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes dropout = 0.5 # Dropout rate for the features l2_reg = 5e-4 / 2 # L2 regularization rate learning_rate = 1e-2 # Learning rate epochs = 200 # Number of training epochs patience = 10 # Patience for early stopping +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes # Model definition X_in = Input(shape=(F, )) diff --git a/examples/node_prediction/ogbn-arxiv_gcn.py b/examples/node_prediction/ogbn-arxiv_gcn.py index 1e4c914f..4d3db382 100644 --- a/examples/node_prediction/ogbn-arxiv_gcn.py +++ b/examples/node_prediction/ogbn-arxiv_gcn.py @@ -10,54 +10,37 @@ from tensorflow.keras.losses import SparseCategoricalCrossentropy from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam -from tqdm import tqdm -from spektral.datasets import ogb +from spektral.datasets.ogb import OGB from spektral.layers import GraphConv - - -def evaluate(X, fltr, y, model, masks, evaluator): - p = model.predict_on_batch([X, fltr]) - p = p.argmax(-1)[:, None] - tr_mask, va_mask, te_mask = masks - tr_auc = evaluator.eval({'y_true': y[tr_mask], - 'y_pred': p[tr_mask]})['acc'] - va_auc = evaluator.eval({'y_true': y[va_mask], - 'y_pred': p[va_mask]})['acc'] - te_auc = evaluator.eval({'y_true': y[te_mask], - 'y_pred': p[te_mask]})['acc'] - return tr_auc, va_auc, te_auc - +from spektral.transforms import GCNFilter, AdjToSpTensor # Load data dataset_name = 'ogbn-arxiv' -dataset = NodePropPredDataset(dataset_name) -evaluator = Evaluator(dataset_name) -graph, y = dataset[0] -X, A, _ = ogb.graph_to_numpy(graph) -N = A.shape[0] +ogb_dataset = NodePropPredDataset(dataset_name) +dataset = OGB(ogb_dataset, transforms=[GCNFilter(), AdjToSpTensor()]) +graph = dataset[0] +x, adj, y = graph.x, graph.adj, graph.y +N = dataset.N # Data splits -idxs = dataset.get_idx_split() -tr_idx, va_idx, te_idx = idxs["train"], idxs["valid"], idxs["test"] -tr_mask = np.zeros(N, dtype=bool) -tr_mask[tr_idx] = True -va_mask = np.zeros(N, dtype=bool) -va_mask[va_idx] = True -te_mask = np.zeros(N, dtype=bool) -te_mask[te_idx] = True -masks = [tr_mask, va_mask, te_mask] +idx = ogb_dataset.get_idx_split() +idx_tr, idx_va, idx_te = idx["train"], idx["valid"], 
idx["test"] +mask_tr = np.zeros(N, dtype=bool) +mask_va = np.zeros(N, dtype=bool) +mask_te = np.zeros(N, dtype=bool) +mask_tr[idx_tr] = True +mask_va[idx_va] = True +mask_te[idx_te] = True +masks = [mask_tr, mask_va, mask_te] # Parameters channels = 256 -learning_rate = 1e-2 -dropout = 0.5 -epochs = 200 -F = X.shape[1] -n_classes = dataset.num_classes - -# Preprocessing operations -fltr = GraphConv.preprocess(A).astype('f4') +dropout = 0.5 # Dropout rate for the features +learning_rate = 1e-2 # Learning rate +epochs = 200 # Number of training epochs +F = dataset.F # Original size of node features +n_out = ogb_dataset.num_classes # OGB labels are sparse indices # Model definition X_in = Input(shape=(F, )) @@ -68,7 +51,7 @@ def evaluate(X, fltr, y, model, masks, evaluator): X_2 = GraphConv(channels, activation='relu')([X_1, fltr_in]) X_2 = BatchNormalization()(X_2) X_2 = Dropout(dropout)(X_2) -X_3 = GraphConv(n_classes, activation='softmax')([X_2, fltr_in]) +X_3 = GraphConv(n_out, activation='softmax')([X_2, fltr_in]) # Build model model = Model(inputs=[X_in, fltr_in], outputs=X_3) @@ -76,17 +59,28 @@ def evaluate(X, fltr, y, model, masks, evaluator): model.compile(optimizer=optimizer, loss=SparseCategoricalCrossentropy()) model.summary() + +# Evaluation with OGB +evaluator = Evaluator(dataset_name) +def evaluate(X, fltr, y, model, masks, evaluator): + p = model.predict_on_batch([X, fltr]) + p = p.argmax(-1)[:, None] + tr_mask, va_mask, te_mask = masks + tr_auc = evaluator.eval({'y_true': y[tr_mask], 'y_pred': p[tr_mask]})['acc'] + va_auc = evaluator.eval({'y_true': y[va_mask], 'y_pred': p[va_mask]})['acc'] + te_auc = evaluator.eval({'y_true': y[te_mask], 'y_pred': p[te_mask]})['acc'] + return tr_auc, va_auc, te_auc + + # Train model -for i in tqdm(range(1, 1 + epochs)): - tr_loss = model.train_on_batch([X, fltr], y, sample_weight=tr_mask) - tr_auc, va_auc, te_auc = evaluate(X, fltr, y, model, masks, evaluator) - tqdm.write( - 'Ep. {} - Loss: {:.3f} - Acc: {:.3f} - Val acc: {:.3f} - Test acc: {:.3f}' - .format(i, tr_loss, tr_auc, va_auc, te_auc) - ) +for i in range(1, 1 + epochs): + tr_loss = model.train_on_batch([x, adj], y, sample_weight=mask_tr) + tr_auc, va_auc, te_auc = evaluate(x, adj, y, model, masks, evaluator) + print('Ep. {} - Loss: {:.3f} - Acc: {:.3f} - Val acc: {:.3f} - Test acc: ' + '{:.3f}'.format(i, tr_loss, tr_auc, va_auc, te_auc)) # Evaluate model print('Evaluating model.') -te_loss = model.test_on_batch([X, fltr], y, sample_weight=te_mask) -tr_auc, va_auc, te_auc = evaluate(X, fltr, y, model, masks, evaluator) +te_loss = model.test_on_batch([x, adj], y, sample_weight=mask_te) +tr_auc, va_auc, te_auc = evaluate(x, adj, y, model, masks, evaluator) print('Done! 
Loss: {:.2f} - Test acc: {:.3f}'.format(te_loss, te_auc)) diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 1ea98b61..b609761c 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -36,7 +36,7 @@ def read(self): ``` >>> dataset.F - >>> dataset.s_1 + >>> dataset.S >>> dataset.n_out ``` @@ -115,9 +115,9 @@ def _signature(self): if graph.edge_attr is not None: signature['e'] = dict() signature['e']['spec'] = get_spec(graph.edge_attr) - signature['e']['shape'] = (None, graph.s_1) + signature['e']['shape'] = (None, graph.S) signature['e']['dtype'] = tf.as_dtype(graph.edge_attr.dtype) - self.S = graph.s_1 + self.S = graph.S if graph.y is not None: signature['y'] = dict() signature['y']['spec'] = get_spec(graph.y) diff --git a/spektral/data/utils.py b/spektral/data/utils.py index f2cab63c..ebff22b1 100644 --- a/spektral/data/utils.py +++ b/spektral/data/utils.py @@ -57,7 +57,7 @@ def to_disjoint(x_list, a_list, e_list=None): # Edge attributes if e_list is not None: if e_list[0].ndim == 3: # Convert dense to sparse - e_list = [e[a.row, a.col] for e, a in zip(e_list, a_list)] + e_list = [e[sp.find(a)[:-1]] for e, a in zip(e_list, a_list)] e_out = np.vstack(e_list) return x_out, a_out, e_out, i_out else: diff --git a/spektral/datasets/ogb.py b/spektral/datasets/ogb.py index 73669885..210c4ece 100644 --- a/spektral/datasets/ogb.py +++ b/spektral/datasets/ogb.py @@ -18,7 +18,11 @@ def __init__(self, dataset, **kwargs): super().__init__(**kwargs) def read(self): - return [Graph(*_elem_to_numpy(elem)) for elem in self.dataset] + if len(self.dataset) > 1: + return [Graph(*_elem_to_numpy(elem)) for elem in self.dataset] + else: + # OGB crashed if we try to iterate over a NodePropPredDataset + return [Graph(*_elem_to_numpy(self.dataset[0]))] def _elem_to_numpy(elem): diff --git a/spektral/layers/ops/sparse.py b/spektral/layers/ops/sparse.py index 0cef053e..a5e24c30 100644 --- a/spektral/layers/ops/sparse.py +++ b/spektral/layers/ops/sparse.py @@ -10,16 +10,12 @@ def sp_matrix_to_sp_tensor(x): :param x: a Scipy sparse matrix. :return: a SparseTensor. 
""" - if not hasattr(x, 'tocoo'): - try: - x = sp.coo_matrix(x) - except: - raise TypeError('x must be convertible to scipy.coo_matrix') - else: - x = x.tocoo() + if len(x.shape) != 2: + raise ValueError('x must have rank 2') + row, col, values = sp.find(x) out = tf.SparseTensor( - indices=np.array([x.row, x.col]).T, - values=x.data, + indices=np.array([row, col]).T, + values=values, dense_shape=x.shape ) return tf.sparse.reorder(out) @@ -33,9 +29,7 @@ def sp_batch_to_sp_tensor(a_list): """ tensor_data = [] for i, a in enumerate(a_list): - values = a.tocoo().data - row = a.row - col = a.col + row, col, values = sp.find(a) batch = np.ones_like(col) * i tensor_data.append((values, batch, row, col)) tensor_data = list(map(np.concatenate, zip(*tensor_data))) From 4bc4e5b1b72f9acbf0144b932c58af3c8bc22c26 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 14:30:25 +0100 Subject: [PATCH 22/57] Clean up examples --- examples/node_prediction/citation_arma.py | 9 +++++---- examples/node_prediction/citation_cheby.py | 7 ++++--- examples/node_prediction/citation_gat.py | 6 +++--- examples/node_prediction/citation_gcn.py | 1 + .../node_prediction/citation_simple_gc.py | 7 ++++--- examples/node_prediction/ogbn-arxiv_gcn.py | 19 ++++++++++--------- 6 files changed, 27 insertions(+), 22 deletions(-) diff --git a/examples/node_prediction/citation_arma.py b/examples/node_prediction/citation_arma.py index 07561e66..3f1612cc 100644 --- a/examples/node_prediction/citation_arma.py +++ b/examples/node_prediction/citation_arma.py @@ -26,16 +26,17 @@ iterations = 1 # Number of iterations to approximate each ARMA(1) order = 2 # Order of the ARMA filter (number of parallel stacks) share_weights = True # Share weights in each ARMA stack -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes -dropout = 0.5 # Dropout rate for the features dropout_skip = 0.75 # Dropout rate for the internal skip connection of ARMA +dropout = 0.5 # Dropout rate for the features l2_reg = 5e-5 # L2 regularization rate learning_rate = 1e-2 # Learning rate epochs = 20000 # Number of training epochs patience = 100 # Patience for early stopping +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes + # Model definition X_in = Input(shape=(F, )) fltr_in = Input((N, ), sparse=True) diff --git a/examples/node_prediction/citation_cheby.py b/examples/node_prediction/citation_cheby.py index f814289e..ff99e1b5 100644 --- a/examples/node_prediction/citation_cheby.py +++ b/examples/node_prediction/citation_cheby.py @@ -25,15 +25,16 @@ # Parameters channels = 16 # Number of channels in the first layer K = 2 # Max degree of the Chebyshev polynomials -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes dropout = 0.5 # Dropout rate for the features l2_reg = 5e-4 / 2 # L2 regularization rate learning_rate = 1e-2 # Learning rate epochs = 200 # Number of training epochs patience = 10 # Patience for early stopping +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes + # Model definition X_in = Input(shape=(F, )) fltr_in = Input((N, ), sparse=True) diff --git a/examples/node_prediction/citation_gat.py b/examples/node_prediction/citation_gat.py index 6eeeddf1..a59458e7 100644 --- 
a/examples/node_prediction/citation_gat.py +++ b/examples/node_prediction/citation_gat.py @@ -24,15 +24,15 @@ # Parameters channels = 8 # Number of channels in each head of the first GAT layer n_attn_heads = 8 # Number of attention heads in first GAT layer -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes dropout = 0.6 # Dropout rate for the features and adjacency matrix l2_reg = 5e-6 # L2 regularization rate learning_rate = 5e-3 # Learning rate epochs = 20000 # Number of training epochs patience = 100 # Patience for early stopping +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes # Model definition X_in = Input(shape=(F, )) diff --git a/examples/node_prediction/citation_gcn.py b/examples/node_prediction/citation_gcn.py index 75eaa1d6..55b6c5ed 100644 --- a/examples/node_prediction/citation_gcn.py +++ b/examples/node_prediction/citation_gcn.py @@ -28,6 +28,7 @@ learning_rate = 1e-2 # Learning rate epochs = 200 # Number of training epochs patience = 10 # Patience for early stopping + N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features n_out = dataset.n_out # Number of classes diff --git a/examples/node_prediction/citation_simple_gc.py b/examples/node_prediction/citation_simple_gc.py index 6e85afb7..ea5fc03d 100644 --- a/examples/node_prediction/citation_simple_gc.py +++ b/examples/node_prediction/citation_simple_gc.py @@ -41,14 +41,15 @@ def __call__(self, graph): mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Parameters -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes l2_reg = 5e-6 # L2 regularization rate learning_rate = 0.2 # Learning rate epochs = 20000 # Number of training epochs patience = 200 # Patience for early stopping +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = dataset.n_out # Number of classes + # Model definition X_in = Input(shape=(F, )) fltr_in = Input((N, ), sparse=True) diff --git a/examples/node_prediction/ogbn-arxiv_gcn.py b/examples/node_prediction/ogbn-arxiv_gcn.py index 4d3db382..2bbb4e70 100644 --- a/examples/node_prediction/ogbn-arxiv_gcn.py +++ b/examples/node_prediction/ogbn-arxiv_gcn.py @@ -21,7 +21,16 @@ dataset = OGB(ogb_dataset, transforms=[GCNFilter(), AdjToSpTensor()]) graph = dataset[0] x, adj, y = graph.x, graph.adj, graph.y -N = dataset.N + +# Parameters +channels = 256 # Number of channels for GCN layers +dropout = 0.5 # Dropout rate for the features +learning_rate = 1e-2 # Learning rate +epochs = 200 # Number of training epochs + +N = dataset.N # Number of nodes in the graph +F = dataset.F # Original size of node features +n_out = ogb_dataset.num_classes # OGB labels are sparse indices # Data splits idx = ogb_dataset.get_idx_split() @@ -34,14 +43,6 @@ mask_te[idx_te] = True masks = [mask_tr, mask_va, mask_te] -# Parameters -channels = 256 -dropout = 0.5 # Dropout rate for the features -learning_rate = 1e-2 # Learning rate -epochs = 200 # Number of training epochs -F = dataset.F # Original size of node features -n_out = ogb_dataset.num_classes # OGB labels are sparse indices - # Model definition X_in = Input(shape=(F, )) fltr_in = Input((N, ), sparse=True) From 2b9ff35dfe3b5bea79e4ae04bc00779df5b3e0ed Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 
2020 14:36:44 +0100 Subject: [PATCH 23/57] Fix utils.convolution.gcn_filter --- spektral/utils/convolution.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spektral/utils/convolution.py b/spektral/utils/convolution.py index c4ecd492..04e2a734 100644 --- a/spektral/utils/convolution.py +++ b/spektral/utils/convolution.py @@ -117,6 +117,7 @@ def gcn_filter(A, symmetric=True): out[i][np.diag_indices_from(out[i])] += 1 out[i] = normalized_adjacency(out[i], symmetric=symmetric) else: + out = out.tocsr() out[np.diag_indices_from(out)] += 1 out = normalized_adjacency(out, symmetric=symmetric) From a99ae080c317198c4687a5d89532bc2950ea9072 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 4 Nov 2020 18:29:49 +0100 Subject: [PATCH 24/57] Minor changes to datasets --- .github/workflows/examples.yml | 4 ++-- spektral/datasets/graphsage.py | 5 ++++- spektral/datasets/ogb.py | 2 +- spektral/datasets/tudataset.py | 19 ++++++++----------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index ec6ddbd0..9332d699 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -21,11 +21,11 @@ jobs: run: | sudo apt-get update sudo apt-get install -y graphviz libgraphviz-dev libcgraph6 - - name: Python Dependencies + - name: Python dependencies run: | python -m pip install --upgrade pip pip install matplotlib lxml tqdm ogb - - name: Install spektral + - name: Install Spektral run: | pip install . - name: Just one epoch diff --git a/spektral/datasets/graphsage.py b/spektral/datasets/graphsage.py index ac2bde96..ea1a68c0 100644 --- a/spektral/datasets/graphsage.py +++ b/spektral/datasets/graphsage.py @@ -78,9 +78,12 @@ def read(self): return [Graph(x=x, adj=adj, y=y)] def download(self): - print('Dowloading', self.name, 'dataset.') + print('Downloading {} dataset.'.format(self.name)) url = self.url.format(self.name) req = requests.get(url) + if req.status_code == 404: + raise ValueError('Cannot download dataset ({} returned 404).' + .format(self.url)) os.makedirs(self.path, exist_ok=True) fname = osp.join(self.path, self.name + '.zip') diff --git a/spektral/datasets/ogb.py b/spektral/datasets/ogb.py index 210c4ece..801d156c 100644 --- a/spektral/datasets/ogb.py +++ b/spektral/datasets/ogb.py @@ -30,7 +30,7 @@ def _elem_to_numpy(elem): n = graph['num_nodes'] x = graph['node_feat'] row, col = graph['edge_index'] - a = sp.coo_matrix((np.ones_like(row), (row, col)), shape=(n, n)) + a = sp.coo_matrix((np.ones_like(row), (row, col)), shape=(n, n)).tocsr() e = graph['edge_feat'] return x, a, e, label diff --git a/spektral/datasets/tudataset.py b/spektral/datasets/tudataset.py index 76a2d546..0ad08141 100644 --- a/spektral/datasets/tudataset.py +++ b/spektral/datasets/tudataset.py @@ -63,24 +63,21 @@ def path(self): def download(self): print('Downloading {} dataset{}.' .format(self.name, ' (clean)' if self.clean else '')) - url = '{}/{}.zip'.format(self.url_clean if self.clean else self.url, self.name) - req = requests.get(url) if req.status_code == 404: - raise ValueError('Unknown dataset {}. See TUD.available_datasets()' - ' for a list of available datasets.' - .format(self.name)) - + raise ValueError('Cannot download dataset ({} returned 404).' 
+ .format(self.url)) os.makedirs(self.path, exist_ok=True) - ofname = osp.join(self.path, '{}.zip'.format(self.name)) - with open(ofname, 'wb') as of: + + fname = osp.join(self.path, self.name + '.zip') + with open(fname, 'wb') as of: of.write(req.content) - with zipfile.ZipFile(ofname, 'r') as of: + with zipfile.ZipFile(fname, 'r') as of: of.extractall(self.path) - os.remove(ofname) + os.remove(fname) - # TUD datasets are zipped in a folder: unpack them + # Datasets are zipped in a folder: unpack them parent = self.path subfolder = osp.join(self.path, self.name) for filename in os.listdir(subfolder): From 63c749ed73e2c851cdcc7fe3241b826f2842a882 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 5 Nov 2020 08:59:57 +0100 Subject: [PATCH 25/57] Add OneHotLabels transform --- spektral/transforms/__init__.py | 3 ++- spektral/transforms/one_hot.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 spektral/transforms/one_hot.py diff --git a/spektral/transforms/__init__.py b/spektral/transforms/__init__.py index 4fa080fc..104ba434 100644 --- a/spektral/transforms/__init__.py +++ b/spektral/transforms/__init__.py @@ -2,4 +2,5 @@ from .degree import Degree, MaxDegree from .gcn_filter import GCNFilter from .layer_preprocess import LayerPreprocess -from .normalize_adj import NormalizeAdj \ No newline at end of file +from .normalize_adj import NormalizeAdj +from .one_hot import OneHotLabels diff --git a/spektral/transforms/one_hot.py b/spektral/transforms/one_hot.py new file mode 100644 index 00000000..0ad172ef --- /dev/null +++ b/spektral/transforms/one_hot.py @@ -0,0 +1,17 @@ +from spektral.utils import one_hot, label_to_one_hot + + +class OneHotLabels(object): + def __init__(self, depth=None, labels=None): + self.depth = depth + self.labels = labels + if self.depth is None and self.labels is None: + raise ValueError('Must specify either depth or labels.') + + def __call__(self, graph): + if self.labels is not None: + graph.y = label_to_one_hot(graph.y, self.labels) + else: + graph.y = one_hot(graph.y, self.depth) + + return graph From 75633dff60d99c7b04644c61e9b4a8acee3e3ca8 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 5 Nov 2020 16:49:32 +0100 Subject: [PATCH 26/57] Improvements to data module --- spektral/data/dataset.py | 56 ++++++++++++++++++++------------- spektral/data/graph.py | 36 ++++++++++++++++++--- spektral/data/loaders.py | 40 +++++++++++++---------- spektral/data/utils.py | 3 +- spektral/layers/ops/matmul.py | 14 ++++----- spektral/utils/convolution.py | 9 ++++-- tests/test_data/test_dataset.py | 4 +-- 7 files changed, 108 insertions(+), 54 deletions(-) diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index b609761c..9a7c216f 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -39,10 +39,6 @@ def read(self): >>> dataset.S >>> dataset.n_out ``` - - The general shape, dtype, and `tf.TypeSpec` of the matrices composing the - graphs is stored in `dataset.signature`. This can be useful when - implementing a custom Loader for your dataset. 
""" def __init__(self, transforms=None, **kwargs): if not osp.exists(self.path): @@ -52,15 +48,6 @@ def __init__(self, transforms=None, **kwargs): if len(self.graphs) == 0: raise ValueError('Datasets cannot be empty') - if len(self.graphs) == 1 or len(set([g.N for g in self.graphs])) == 1: - self.N = self.graphs[0].N - else: - self.N = None - self.F = None - self.S = None - self.n_out = None - self.signature = self._signature() - # Read extra kwargs for k, v in kwargs.items(): setattr(self, k, v) @@ -98,7 +85,10 @@ def map(self, transform, reduce=None): else: return out - def _signature(self): + def filter(self, function): + self.graphs = [g for g in self.graphs if function(g)] + + def signature(self): signature = {} graph = self.graphs[0] # This is always non-empty if graph.x is not None: @@ -106,7 +96,6 @@ def _signature(self): signature['x']['spec'] = get_spec(graph.x) signature['x']['shape'] = (None, graph.F) signature['x']['dtype'] = tf.as_dtype(graph.x.dtype) - self.F = graph.F if graph.adj is not None: signature['a'] = dict() signature['a']['spec'] = get_spec(graph.adj) @@ -117,13 +106,12 @@ def _signature(self): signature['e']['spec'] = get_spec(graph.edge_attr) signature['e']['shape'] = (None, graph.S) signature['e']['dtype'] = tf.as_dtype(graph.edge_attr.dtype) - self.S = graph.S if graph.y is not None: signature['y'] = dict() signature['y']['spec'] = get_spec(graph.y) - signature['y']['shape'] = (graph.y.shape[-1], ) - signature['y']['dtype'] = tf.as_dtype(graph.y.dtype) - self.n_out = graph.y.shape[-1] + signature['y']['shape'] = (self.n_out, ) + signature['y']['dtype'] = tf.as_dtype(np.array(graph.y).dtype) + return signature @@ -168,5 +156,31 @@ def __len__(self): return len(self.graphs) def __repr__(self): - return 'Dataset(len={}, signature="{}")'\ - .format(self.__len__(), ', '.join(self.signature.keys())) \ No newline at end of file + return '{}({})'.format(self.__class__.__name__, self.__len__()) + + @property + def N(self): + if len(self.graphs) == 1 or len(set([g.N for g in self.graphs])) == 1: + return self.graphs[0].N + else: + return None + + @property + def F(self): + return self.graphs[0].F + + @property + def S(self): + return self.graphs[0].S + + @property + def n_out(self): + y = self.graphs[0].y + if y is None: + return None + else: + shp = np.shape(y) + if len(shp) == 0: + return 1 + else: + return shp[-1] diff --git a/spektral/data/graph.py b/spektral/data/graph.py index 23cd7ff7..2f8c6c9c 100644 --- a/spektral/data/graph.py +++ b/spektral/data/graph.py @@ -1,3 +1,6 @@ +import numpy as np +import scipy.sparse as sp + class Graph: """ A container to represent a graph with: @@ -30,21 +33,44 @@ def __init__(self, x=None, adj=None, edge_attr=None, y=None, **kwargs): self.adj = adj self.edge_attr = edge_attr self.y = y + # Read extra kwargs for k, v in kwargs.items(): self[k] = v - self.N = None if self.x is None else self.x.shape[-2] - self.F = None if self.x is None else self.x.shape[-1] - self.S = None if self.edge_attr is None else self.edge_attr.shape[-1] - def numpy(self): return tuple(ret for ret in [self.x, self.adj, self.edge_attr, self.y] if ret is not None) + def __setitem__(self, key, value): + setattr(self, key, value) + def __getitem__(self, key): return getattr(self, key, None) def __repr__(self): return 'Graph(N={}, F={}, S={}, y={}'\ - .format(self.N, self.F, self.S, self.y) \ No newline at end of file + .format(self.N, self.F, self.S, self.y) + + @property + def N(self): + if self.x is not None: + return self.x.shape[-2] + elif self.adj is not 
None: + return self.adj.shape[-1] + else: + return None + + @property + def F(self): + if self.x is not None: + return self.x.shape[-1] + else: + return None + + @property + def S(self): + if self.edge_attr is not None: + return self.edge_attr.shape[-1] + else: + return None diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index e8ced854..0744bdce 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -4,7 +4,7 @@ import tensorflow as tf from scipy import sparse as sp -from spektral.data.utils import prepend_none, output_signature, to_disjoint, to_batch, batch_generator +from spektral.data.utils import prepend_none, to_tf_signature, to_disjoint, to_batch, batch_generator from spektral.layers.ops import sp_matrix_to_sp_tensor version = tf.__version__.split('.') @@ -28,7 +28,6 @@ def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): self.epochs = epochs self.shuffle = shuffle self._generator = self.generator() - self.steps_per_epoch = int(np.ceil(len(self.dataset) / self.batch_size)) def __iter__(self): return self @@ -47,9 +46,16 @@ def collate(self, batch): def tf(self): raise NotImplementedError + def tf_signature(self): + raise NotImplementedError + def _pack(self, batch): return [list(elem) for elem in zip(*[g.numpy() for g in batch])] + @property + def steps_per_epoch(self): + return int(np.ceil(len(self.dataset) / self.batch_size)) + class SingleLoader(Loader): """ @@ -71,6 +77,9 @@ def tf(self): output = self.collate(self.dataset) return tf.data.Dataset.from_tensors(output).repeat(self.epochs) + def tf_signature(self): + pass + class DisjointLoader(Loader): """ @@ -92,24 +101,22 @@ def tf(self): if not tf_loader_available: raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' 'or greater.') - signature = copy.deepcopy(self.dataset.signature) + return tf.data.Dataset.from_generator( + lambda: (_ for _ in self), output_signature=self.tf_signature()) + + def tf_signature(self): + signature = self.dataset.signature() if 'y' in signature: - # Targets have an extra None dimension in batch mode signature['y']['shape'] = prepend_none(signature['y']['shape']) - if 'a' in signature: - # Adjacency matrix in batch mode is sparse signature['a']['spec'] = tf.SparseTensorSpec signature['i'] = dict() signature['i']['spec'] = tf.TensorSpec - signature['i']['shape'] = (None, ) + signature['i']['shape'] = (None,) signature['i']['dtype'] = tf.as_dtype(tf.int64) - return tf.data.Dataset.from_generator( - lambda: (_ for _ in self), - output_signature=output_signature(signature) - ) + return to_tf_signature(signature) class BatchLoader(Loader): @@ -127,7 +134,11 @@ def tf(self): if not tf_loader_available: raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' 'or greater.') - signature = copy.deepcopy(self.dataset.signature) + return tf.data.Dataset.from_generator( + lambda: (_ for _ in self), output_signature=self.tf_signature()) + + def tf_signature(self): + signature = self.dataset.signature() for k in signature: signature[k]['shape'] = prepend_none(signature[k]['shape']) if 'a' in signature: @@ -137,10 +148,7 @@ def tf(self): # Edge attributes have an extra None dimension in batch mode signature['e']['shape'] = prepend_none(signature['e']['shape']) - return tf.data.Dataset.from_generator( - lambda: (_ for _ in self), - output_signature=output_signature(signature) - ) + return to_tf_signature(signature) class PackedBatchLoader(BatchLoader): diff --git a/spektral/data/utils.py b/spektral/data/utils.py index ebff22b1..3659a426 100644 
--- a/spektral/data/utils.py +++ b/spektral/data/utils.py @@ -2,6 +2,7 @@ import tensorflow as tf from scipy import sparse as sp +from spektral.layers.ops import sp_batch_to_sp_tensor from spektral.utils import pad_jagged_array @@ -172,7 +173,7 @@ def prepend_none(t): return (None, ) + t -def output_signature(signature): +def to_tf_signature(signature): output = [] keys = ['x', 'a', 'e', 'i'] for k in keys: diff --git a/spektral/layers/ops/matmul.py b/spektral/layers/ops/matmul.py index 8fe4d5ee..2a033a0e 100644 --- a/spektral/layers/ops/matmul.py +++ b/spektral/layers/ops/matmul.py @@ -36,15 +36,15 @@ def dot(a, b, transpose_a=False, transpose_b=False): tf.sparse.sparse_dense_matmul(ops.transpose(b), ops.transpose(a)) ) - # Fallthrough to tfsp implementation - # Defaults to tf.matmul if neither is sparse - if a_is_sparse_tensor: + # Fallthrough to sp-sp and d-d implementations + if a_is_sparse_tensor and b_is_sparse_tensor: a = tfsp.CSRSparseMatrix(a) - if b_is_sparse_tensor: b = tfsp.CSRSparseMatrix(b) - out = tfsp.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) - if hasattr(out, 'to_sparse_tensor'): - return out.to_sparse_tensor() + out = tfsp.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) + if hasattr(out, 'to_sparse_tensor'): + return out.to_sparse_tensor() + else: + out = tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) return out diff --git a/spektral/utils/convolution.py b/spektral/utils/convolution.py index 04e2a734..8ae3a848 100644 --- a/spektral/utils/convolution.py +++ b/spektral/utils/convolution.py @@ -1,4 +1,5 @@ import copy +import warnings import numpy as np from scipy import sparse as sp @@ -29,7 +30,9 @@ def degree_power(A, k): :return: if A is a dense array, a dense array; if A is sparse, a sparse matrix in DIA format. """ - degrees = np.power(np.array(A.sum(1)), k).ravel() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + degrees = np.power(np.array(A.sum(1)), k).ravel() degrees[np.isinf(degrees)] = 0. 
if sp.issparse(A): D = sp.diags(degrees) @@ -118,7 +121,9 @@ def gcn_filter(A, symmetric=True): out[i] = normalized_adjacency(out[i], symmetric=symmetric) else: out = out.tocsr() - out[np.diag_indices_from(out)] += 1 + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + out[np.diag_indices_from(out)] += 1 out = normalized_adjacency(out, symmetric=symmetric) if sp.issparse(out): diff --git a/tests/test_data/test_dataset.py b/tests/test_data/test_dataset.py index 638eaf94..cddefe14 100644 --- a/tests/test_data/test_dataset.py +++ b/tests/test_data/test_dataset.py @@ -27,9 +27,9 @@ def test_dataset(): assert d.S == s assert d.n_out == 2 - # _signature + # signature for k in ['x', 'a', 'e', 'y']: - assert k in d.signature + assert k in d.signature() # __getitem__ assert isinstance(d[0], Graph) From eb319bb80aee24bbaaee30a07ac1265256268336 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 5 Nov 2020 18:25:38 +0100 Subject: [PATCH 27/57] Fix issue in ops.matmul --- spektral/data/dataset.py | 52 +++++++++++++++++------------------ spektral/layers/ops/matmul.py | 4 ++- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 9a7c216f..bdfa436c 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -88,33 +88,6 @@ def map(self, transform, reduce=None): def filter(self, function): self.graphs = [g for g in self.graphs if function(g)] - def signature(self): - signature = {} - graph = self.graphs[0] # This is always non-empty - if graph.x is not None: - signature['x'] = dict() - signature['x']['spec'] = get_spec(graph.x) - signature['x']['shape'] = (None, graph.F) - signature['x']['dtype'] = tf.as_dtype(graph.x.dtype) - if graph.adj is not None: - signature['a'] = dict() - signature['a']['spec'] = get_spec(graph.adj) - signature['a']['shape'] = (None, None) - signature['a']['dtype'] = tf.as_dtype(graph.adj.dtype) - if graph.edge_attr is not None: - signature['e'] = dict() - signature['e']['spec'] = get_spec(graph.edge_attr) - signature['e']['shape'] = (None, graph.S) - signature['e']['dtype'] = tf.as_dtype(graph.edge_attr.dtype) - if graph.y is not None: - signature['y'] = dict() - signature['y']['spec'] = get_spec(graph.y) - signature['y']['shape'] = (self.n_out, ) - signature['y']['dtype'] = tf.as_dtype(np.array(graph.y).dtype) - - - return signature - def __getitem__(self, key): if not (np.issubdtype(type(key), np.integer) or isinstance(key, (slice, list, tuple, np.ndarray))): @@ -184,3 +157,28 @@ def n_out(self): return 1 else: return shp[-1] + + def signature(self): + signature = {} + graph = self.graphs[0] # This is always non-empty + if graph.x is not None: + signature['x'] = dict() + signature['x']['spec'] = get_spec(graph.x) + signature['x']['shape'] = (None, graph.F) + signature['x']['dtype'] = tf.as_dtype(graph.x.dtype) + if graph.adj is not None: + signature['a'] = dict() + signature['a']['spec'] = get_spec(graph.adj) + signature['a']['shape'] = (None, None) + signature['a']['dtype'] = tf.as_dtype(graph.adj.dtype) + if graph.edge_attr is not None: + signature['e'] = dict() + signature['e']['spec'] = get_spec(graph.edge_attr) + signature['e']['shape'] = (None, graph.S) + signature['e']['dtype'] = tf.as_dtype(graph.edge_attr.dtype) + if graph.y is not None: + signature['y'] = dict() + signature['y']['spec'] = get_spec(graph.y) + signature['y']['shape'] = (self.n_out, ) + signature['y']['dtype'] = tf.as_dtype(np.array(graph.y).dtype) + return signature diff --git 
a/spektral/layers/ops/matmul.py b/spektral/layers/ops/matmul.py index 2a033a0e..bbea4ca8 100644 --- a/spektral/layers/ops/matmul.py +++ b/spektral/layers/ops/matmul.py @@ -37,9 +37,11 @@ def dot(a, b, transpose_a=False, transpose_b=False): ) # Fallthrough to sp-sp and d-d implementations - if a_is_sparse_tensor and b_is_sparse_tensor: + if a_is_sparse_tensor: a = tfsp.CSRSparseMatrix(a) + if b_is_sparse_tensor: b = tfsp.CSRSparseMatrix(b) + if a_is_sparse_tensor or b_is_sparse_tensor: out = tfsp.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) if hasattr(out, 'to_sparse_tensor'): return out.to_sparse_tensor() From db9afff3a4471c9a1e3015ad39d128f4bcc0b7c5 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 6 Nov 2020 12:30:35 +0100 Subject: [PATCH 28/57] Convert COOs to CSRs --- spektral/datasets/citation.py | 2 +- spektral/datasets/graphsage.py | 6 +++--- spektral/datasets/ogb.py | 2 +- spektral/datasets/tudataset.py | 2 +- tests/test_data/test_loaders.py | 4 ++-- tests/test_data/test_utils.py | 2 +- tests/test_layers/test_base.py | 2 +- tests/test_layers/test_ops.py | 4 ++-- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/spektral/datasets/citation.py b/spektral/datasets/citation.py index 14f1806e..06c2b51d 100644 --- a/spektral/datasets/citation.py +++ b/spektral/datasets/citation.py @@ -92,7 +92,7 @@ def read(self): idx_te, y_te, train_size=30 * n_classes, stratify=y_te) # Adjacency matrix - adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) + adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) # CSR adj.setdiag(0) adj.eliminate_zeros() diff --git a/spektral/datasets/graphsage.py b/spektral/datasets/graphsage.py index ea1a68c0..78237a10 100644 --- a/spektral/datasets/graphsage.py +++ b/spektral/datasets/graphsage.py @@ -66,7 +66,7 @@ def read(self): npz_file = osp.join(self.path, self.name) + '.npz' data = np.load(npz_file) x = data['x'] - adj = sp.coo_matrix( + adj = sp.csr_matrix( (data['adj_data'], (data['adj_row'], data['adj_col'])), shape=data['adj_shape'] ) @@ -163,9 +163,9 @@ def preprocess_data(path, name): for edge in G.edges() if edge[0] in id_map and edge[1] in id_map] edges = np.array(edges, dtype=np.int32) - adj = sp.coo_matrix((np.ones((edges.shape[0]), dtype=np.float32), + adj = sp.csr_matrix((np.ones((edges.shape[0]), dtype=np.float32), (edges[:, 0], edges[:, 1])), shape=(n, n)) - adj = adj.maximum(adj.transpose()).tocoo() + adj = adj.maximum(adj.transpose()) # Process labels if isinstance(list(class_map.values())[0], list): diff --git a/spektral/datasets/ogb.py b/spektral/datasets/ogb.py index 801d156c..42fd3b96 100644 --- a/spektral/datasets/ogb.py +++ b/spektral/datasets/ogb.py @@ -30,7 +30,7 @@ def _elem_to_numpy(elem): n = graph['num_nodes'] x = graph['node_feat'] row, col = graph['edge_index'] - a = sp.coo_matrix((np.ones_like(row), (row, col)), shape=(n, n)).tocsr() + a = sp.csr_matrix((np.ones_like(row), (row, col)), shape=(n, n)).tocsr() e = graph['edge_feat'] return x, a, e, label diff --git a/spektral/datasets/tudataset.py b/spektral/datasets/tudataset.py index 0ad08141..48443b56 100644 --- a/spektral/datasets/tudataset.py +++ b/spektral/datasets/tudataset.py @@ -110,7 +110,7 @@ def read(self): edge_lists = np.split(edges - n_nodes_cum[edge_batch_idx, None], n_edges_cum) # Create sparse adjacency matrices a_list = [ - sp.coo_matrix( + sp.csr_matrix( (np.ones_like(el[:, 0]), (el[:, 0], el[:, 1])), shape=(n_nodes[i], n_nodes[i]) ) diff --git a/tests/test_data/test_loaders.py b/tests/test_data/test_loaders.py index 
d4576bb5..65f06eb8 100644 --- a/tests/test_data/test_loaders.py +++ b/tests/test_data/test_loaders.py @@ -25,7 +25,7 @@ def read(self): n = 10 return [ Graph(x=np.random.rand(n, f), - adj=sp.coo_matrix(np.random.randint(0, 2, (n, n))), + adj=sp.csr_matrix(np.random.randint(0, 2, (n, n))), edge_attr=np.random.rand(n, n, s), y=np.array(n * [[0., 1.]])) ] @@ -38,7 +38,7 @@ class TestDataset(Dataset): def read(self): return [ Graph(x=np.random.rand(n, f), - adj=sp.coo_matrix(np.random.randint(0, 2, (n, n))), + adj=sp.csr_matrix(np.random.randint(0, 2, (n, n))), edge_attr=np.random.rand(n, n, s), y=np.array([0., 1.])) for n in ns diff --git a/tests/test_data/test_utils.py b/tests/test_data/test_utils.py index d7721793..235f296f 100644 --- a/tests/test_data/test_utils.py +++ b/tests/test_data/test_utils.py @@ -7,7 +7,7 @@ ns = np.random.randint(3, 10, 10) f = 3 -a_list = [sp.coo_matrix(np.ones((n, n))) for n in ns] +a_list = [sp.csr_matrix(np.ones((n, n))) for n in ns] x_list = [np.random.rand(n, f) for n in ns] y = [[0, 1]] * len(ns) diff --git a/tests/test_layers/test_base.py b/tests/test_layers/test_base.py index 87671fc3..1ac1fd33 100644 --- a/tests/test_layers/test_base.py +++ b/tests/test_layers/test_base.py @@ -11,7 +11,7 @@ def test_Disjoint2Batch(): A_row = [0, 1, 2, 3, 4] A_col = [1, 0, 1, 4, 3] A = ops.sp_matrix_to_sp_tensor( - sp.coo_matrix((A_data, (A_row, A_col)), shape=(5, 5)) + sp.csr_matrix((A_data, (A_row, A_col)), shape=(5, 5)) ) expected_X = np.array([[[1., 0.], diff --git a/tests/test_layers/test_ops.py b/tests/test_layers/test_ops.py index 14f9ca13..f9806f0a 100644 --- a/tests/test_layers/test_ops.py +++ b/tests/test_layers/test_ops.py @@ -1,6 +1,6 @@ import numpy as np import tensorflow as tf -from scipy.sparse import coo_matrix +from scipy.sparse import csr_matrix from spektral.layers import ops from spektral.utils import convolution @@ -215,7 +215,7 @@ def test_modes_ops(): A_data = [1, 1, 1, 1, 1] A_row = [0, 1, 2, 3, 4] A_col = [1, 0, 1, 4, 3] - A_sparse = coo_matrix((A_data, (A_row, A_col)), shape=(5, 5)) + A_sparse = csr_matrix((A_data, (A_row, A_col)), shape=(5, 5)) A_sparse_tensor = ops.sp_matrix_to_sp_tensor(A_sparse) # Disjoint signal to batch From 53fe6409cc1ec077229b5fc2dec576b2dd2f90ed Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 6 Nov 2020 12:51:02 +0100 Subject: [PATCH 29/57] Add transforms --- spektral/transforms/constant.py | 15 +++++++++++++++ spektral/transforms/normalize_one.py | 10 ++++++++++ spektral/transforms/normalize_sphere.py | 10 ++++++++++ 3 files changed, 35 insertions(+) create mode 100644 spektral/transforms/constant.py create mode 100644 spektral/transforms/normalize_one.py create mode 100644 spektral/transforms/normalize_sphere.py diff --git a/spektral/transforms/constant.py b/spektral/transforms/constant.py new file mode 100644 index 00000000..15264184 --- /dev/null +++ b/spektral/transforms/constant.py @@ -0,0 +1,15 @@ +import numpy as np + + +class Constant(object): + def __init__(self, value): + self.value = value + + def __call__(self, graph): + value = np.zeros((graph.N, 1)) + self.value + if graph.x is None: + graph.x = value + else: + graph.x = np.concatenate((graph.x, value), axis=-1) + + return graph diff --git a/spektral/transforms/normalize_one.py b/spektral/transforms/normalize_one.py new file mode 100644 index 00000000..87cd9c7e --- /dev/null +++ b/spektral/transforms/normalize_one.py @@ -0,0 +1,10 @@ +import numpy as np + + +class NormalizeOne: + def __call__(self, graph): + x_sum = np.sum(graph.x, -1) + 
x_sum[x_sum == 0] = 1 + graph.x = graph.x / x_sum[..., None] + + return graph diff --git a/spektral/transforms/normalize_sphere.py b/spektral/transforms/normalize_sphere.py new file mode 100644 index 00000000..efa0bb65 --- /dev/null +++ b/spektral/transforms/normalize_sphere.py @@ -0,0 +1,10 @@ +import numpy as np + + +class NormalizeSphere: + def __call__(self, graph): + offset = np.mean(graph.x, -2, keepdims=True) + scale = 1 / np.abs(graph.x).max() + graph.x = (graph.x - offset) * scale + + return graph From d8d522c85b995dd2d678dbf316091a6559fb7e05 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Sat, 7 Nov 2020 09:34:10 +0100 Subject: [PATCH 30/57] Update examples to work with tf 2.1 --- examples/graph_prediction/custom_dataset.py | 14 +++--- .../graph_prediction/ogbg-mol-hiv_disjoint.py | 6 +-- examples/graph_prediction/qm9_disjoint.py | 12 ++---- examples/graph_prediction/tud_disjoint.py | 11 ++--- examples/node_prediction/citation_arma.py | 11 ++--- examples/node_prediction/citation_cheby.py | 29 +++++++------ examples/node_prediction/citation_gat.py | 43 ++++++++++--------- examples/node_prediction/citation_gcn.py | 26 +++++------ .../node_prediction/citation_simple_gc.py | 10 +++-- examples/node_prediction/ogbn-arxiv_gcn.py | 24 +++++------ 10 files changed, 89 insertions(+), 97 deletions(-) diff --git a/examples/graph_prediction/custom_dataset.py b/examples/graph_prediction/custom_dataset.py index c1188a38..2685f0a2 100644 --- a/examples/graph_prediction/custom_dataset.py +++ b/examples/graph_prediction/custom_dataset.py @@ -95,6 +95,10 @@ def make_graph(): dataset_va = dataset[idx_va] dataset_te = dataset[idx_te] +loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) +loader_va = DisjointLoader(dataset_va, batch_size=batch_size) +loader_te = DisjointLoader(dataset_te, batch_size=batch_size) + ################################################################################ # BUILD (unnecessarily big) MODEL ################################################################################ @@ -120,12 +124,7 @@ def make_graph(): ################################################################################ # FIT MODEL ################################################################################ -@tf.function( - input_signature=((tf.TensorSpec((None, F), dtype=tf.float64), - tf.SparseTensorSpec((None, None), dtype=tf.float64), - tf.TensorSpec((None,), dtype=tf.int64)), - tf.TensorSpec((None, n_out), dtype=tf.float64)), - experimental_relax_shapes=True) +@tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True) def train_step(inputs, target): with tf.GradientTape() as tape: predictions = model(inputs, training=True) @@ -158,8 +157,6 @@ def evaluate(loader, ops_list): best_weights = None patience = es_patience -loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) -loader_va = DisjointLoader(dataset_va, batch_size=batch_size) for batch in loader_tr: outs = train_step(*batch) @@ -196,6 +193,5 @@ def evaluate(loader, ops_list): ################################################################################ print('Testing model') model.set_weights(best_weights) # Load best model -loader_te = DisjointLoader(dataset_te, batch_size=batch_size) test_loss, test_acc = evaluate(loader_te, [loss_fn, acc_fn]) print('Done. Test loss: {:.4f}. 
Test acc: {:.2f}'.format(test_loss, test_acc)) diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py index 29649ffd..e73a26db 100644 --- a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py +++ b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py @@ -44,6 +44,9 @@ dataset_va = dataset[idx_va] dataset_te = dataset[idx_te] +loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) +loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) + ################################################################################ # BUILD MODEL ################################################################################ @@ -86,10 +89,8 @@ def train_step(inputs, target): print('Fitting model') current_batch = 0 model_loss = 0 -loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) for batch in loader_tr: outs = train_step(*batch) - model_loss += outs current_batch += 1 if current_batch == loader_tr.steps_per_epoch: @@ -104,7 +105,6 @@ def train_step(inputs, target): evaluator = Evaluator(name=dataset_name) y_true = [] y_pred = [] -loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) for batch in loader_te: inputs, target = batch p = model(inputs, training=False) diff --git a/examples/graph_prediction/qm9_disjoint.py b/examples/graph_prediction/qm9_disjoint.py index b02e65c4..4fb7cfa7 100644 --- a/examples/graph_prediction/qm9_disjoint.py +++ b/examples/graph_prediction/qm9_disjoint.py @@ -37,6 +37,8 @@ idx_tr, idx_te = np.split(idxs, [split]) dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te] +loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) +loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) ################################################################################ # BUILD MODEL @@ -60,13 +62,7 @@ ################################################################################ # FIT MODEL ################################################################################ -@tf.function( - input_signature=((tf.TensorSpec((None, F), dtype=tf.float64), - tf.SparseTensorSpec((None, None), dtype=tf.int64), - tf.TensorSpec((None, S), dtype=tf.float64), - tf.TensorSpec((None,), dtype=tf.int64)), - tf.TensorSpec((None, n_out), dtype=tf.float64)), - experimental_relax_shapes=True) +@tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True) def train_step(inputs, target): with tf.GradientTape() as tape: predictions = model(inputs, training=True) @@ -80,7 +76,6 @@ def train_step(inputs, target): print('Fitting model') current_batch = 0 model_loss = 0 -loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) for batch in loader_tr: outs = train_step(*batch) @@ -96,7 +91,6 @@ def train_step(inputs, target): ################################################################################ print('Testing model') model_loss = 0 -loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) for batch in loader_te: inputs, target = batch predictions = model(inputs, training=False) diff --git a/examples/graph_prediction/tud_disjoint.py b/examples/graph_prediction/tud_disjoint.py index a1518a35..7d7ab1d7 100644 --- a/examples/graph_prediction/tud_disjoint.py +++ b/examples/graph_prediction/tud_disjoint.py @@ -40,6 +40,8 @@ idx_tr, idx_te = np.split(idxs, [split]) dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te] +loader_tr = DisjointLoader(dataset_tr, 
batch_size=batch_size, epochs=epochs) +loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) ################################################################################ # BUILD MODEL @@ -78,12 +80,7 @@ def call(self, inputs, **kwargs): ################################################################################ # FIT MODEL ################################################################################ -@tf.function( - input_signature=((tf.TensorSpec((None, F), dtype=tf.float64), - tf.SparseTensorSpec((None, None), dtype=tf.int64), - tf.TensorSpec((None,), dtype=tf.int64)), - tf.TensorSpec((None, n_out), dtype=tf.float64)), - experimental_relax_shapes=True) +@tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True) def train_step(inputs, target): with tf.GradientTape() as tape: predictions = model(inputs, training=True) @@ -98,7 +95,6 @@ def train_step(inputs, target): print('Fitting model') current_batch = 0 model_lss = model_acc = 0 -loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) for batch in loader_tr: lss, acc = train_step(*batch) @@ -117,7 +113,6 @@ def train_step(inputs, target): ################################################################################ print('Testing model') model_lss = model_acc = 0 -loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) for batch in loader_te: inputs, target = batch predictions = model(inputs, training=False) diff --git a/examples/node_prediction/citation_arma.py b/examples/node_prediction/citation_arma.py index 3f1612cc..1c9e74ad 100644 --- a/examples/node_prediction/citation_arma.py +++ b/examples/node_prediction/citation_arma.py @@ -32,14 +32,15 @@ learning_rate = 1e-2 # Learning rate epochs = 20000 # Number of training epochs patience = 100 # Patience for early stopping +a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features n_out = dataset.n_out # Number of classes # Model definition -X_in = Input(shape=(F, )) -fltr_in = Input((N, ), sparse=True) +x_in = Input(shape=(F,)) +a_in = Input((N,), sparse=True, dtype=a_dtype) gc_1 = ARMAConv(channels, iterations=iterations, @@ -48,7 +49,7 @@ dropout_rate=dropout_skip, activation='elu', gcn_activation='elu', - kernel_regularizer=l2(l2_reg))([X_in, fltr_in]) + kernel_regularizer=l2(l2_reg))([x_in, a_in]) gc_2 = Dropout(dropout)(gc_1) gc_2 = ARMAConv(n_out, iterations=1, @@ -57,10 +58,10 @@ dropout_rate=dropout_skip, activation='softmax', gcn_activation=None, - kernel_regularizer=l2(l2_reg))([gc_2, fltr_in]) + kernel_regularizer=l2(l2_reg))([gc_2, a_in]) # Build model -model = Model(inputs=[X_in, fltr_in], outputs=gc_2) +model = Model(inputs=[x_in, a_in], outputs=gc_2) optimizer = Adam(learning_rate=learning_rate) model.compile(optimizer=optimizer, loss='categorical_crossentropy', diff --git a/examples/node_prediction/citation_cheby.py b/examples/node_prediction/citation_cheby.py index ff99e1b5..5322eb69 100644 --- a/examples/node_prediction/citation_cheby.py +++ b/examples/node_prediction/citation_cheby.py @@ -30,29 +30,30 @@ learning_rate = 1e-2 # Learning rate epochs = 200 # Number of training epochs patience = 10 # Patience for early stopping +a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features n_out = dataset.n_out # Number of classes # Model definition -X_in = Input(shape=(F, )) -fltr_in = Input((N, ), 
sparse=True) +x_in = Input(shape=(F,)) +a_in = Input((N,), sparse=True, dtype=a_dtype) -dropout_1 = Dropout(dropout)(X_in) -graph_conv_1 = ChebConv(channels, - K=K, - activation='relu', - kernel_regularizer=l2(l2_reg), - use_bias=False)([dropout_1, fltr_in]) -dropout_2 = Dropout(dropout)(graph_conv_1) -graph_conv_2 = ChebConv(n_out, - K=K, - activation='softmax', - use_bias=False)([dropout_2, fltr_in]) +do_1 = Dropout(dropout)(x_in) +gc_1 = ChebConv(channels, + K=K, + activation='relu', + kernel_regularizer=l2(l2_reg), + use_bias=False)([do_1, a_in]) +do_2 = Dropout(dropout)(gc_1) +gc_2 = ChebConv(n_out, + K=K, + activation='softmax', + use_bias=False)([do_2, a_in]) # Build model -model = Model(inputs=[X_in, fltr_in], outputs=graph_conv_2) +model = Model(inputs=[x_in, a_in], outputs=gc_2) optimizer = Adam(lr=learning_rate) model.compile(optimizer=optimizer, loss='categorical_crossentropy', diff --git a/examples/node_prediction/citation_gat.py b/examples/node_prediction/citation_gat.py index a59458e7..12b36d45 100644 --- a/examples/node_prediction/citation_gat.py +++ b/examples/node_prediction/citation_gat.py @@ -29,36 +29,37 @@ learning_rate = 5e-3 # Learning rate epochs = 20000 # Number of training epochs patience = 100 # Patience for early stopping +a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features n_out = dataset.n_out # Number of classes # Model definition -X_in = Input(shape=(F, )) -A_in = Input(shape=(N, ), sparse=True) +x_in = Input(shape=(F,)) +a_in = Input((N,), sparse=True, dtype=a_dtype) -dropout_1 = Dropout(dropout)(X_in) -graph_attention_1 = GraphAttention(channels, - attn_heads=n_attn_heads, - concat_heads=True, - dropout_rate=dropout, - activation='elu', - kernel_regularizer=l2(l2_reg), - attn_kernel_regularizer=l2(l2_reg) - )([dropout_1, A_in]) -dropout_2 = Dropout(dropout)(graph_attention_1) -graph_attention_2 = GraphAttention(n_out, - attn_heads=1, - concat_heads=False, - dropout_rate=dropout, - activation='softmax', - kernel_regularizer=l2(l2_reg), - attn_kernel_regularizer=l2(l2_reg) - )([dropout_2, A_in]) +do_1 = Dropout(dropout)(x_in) +gc_1 = GraphAttention(channels, + attn_heads=n_attn_heads, + concat_heads=True, + dropout_rate=dropout, + activation='elu', + kernel_regularizer=l2(l2_reg), + attn_kernel_regularizer=l2(l2_reg) + )([do_1, a_in]) +do_2 = Dropout(dropout)(gc_1) +gc_2 = GraphAttention(n_out, + attn_heads=1, + concat_heads=False, + dropout_rate=dropout, + activation='softmax', + kernel_regularizer=l2(l2_reg), + attn_kernel_regularizer=l2(l2_reg) + )([do_2, a_in]) # Build model -model = Model(inputs=[X_in, A_in], outputs=graph_attention_2) +model = Model(inputs=[x_in, a_in], outputs=gc_2) optimizer = Adam(lr=learning_rate) model.compile(optimizer=optimizer, loss='categorical_crossentropy', diff --git a/examples/node_prediction/citation_gcn.py b/examples/node_prediction/citation_gcn.py index 55b6c5ed..74ae906d 100644 --- a/examples/node_prediction/citation_gcn.py +++ b/examples/node_prediction/citation_gcn.py @@ -15,6 +15,7 @@ from spektral.datasets.citation import Citation from spektral.layers import GraphConv from spektral.transforms import LayerPreprocess, AdjToSpTensor +import tensorflow as tf # Load data dataset = Citation('cora', @@ -28,27 +29,28 @@ learning_rate = 1e-2 # Learning rate epochs = 200 # Number of training epochs patience = 10 # Patience for early stopping +a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes 
in the graph F = dataset.F # Original size of node features n_out = dataset.n_out # Number of classes # Model definition -X_in = Input(shape=(F, )) -fltr_in = Input((N, ), sparse=True) +x_in = Input(shape=(F,)) +a_in = Input((N,), sparse=True, dtype=a_dtype) -dropout_1 = Dropout(dropout)(X_in) -graph_conv_1 = GraphConv(channels, - activation='relu', - kernel_regularizer=l2(l2_reg), - use_bias=False)([dropout_1, fltr_in]) -dropout_2 = Dropout(dropout)(graph_conv_1) -graph_conv_2 = GraphConv(n_out, - activation='softmax', - use_bias=False)([dropout_2, fltr_in]) +do_1 = Dropout(dropout)(x_in) +gc_1 = GraphConv(channels, + activation='relu', + kernel_regularizer=l2(l2_reg), + use_bias=False)([do_1, a_in]) +do_2 = Dropout(dropout)(gc_1) +gc_2 = GraphConv(n_out, + activation='softmax', + use_bias=False)([do_2, a_in]) # Build model -model = Model(inputs=[X_in, fltr_in], outputs=graph_conv_2) +model = Model(inputs=[x_in, a_in], outputs=gc_2) optimizer = Adam(lr=learning_rate) model.compile(optimizer=optimizer, loss='categorical_crossentropy', diff --git a/examples/node_prediction/citation_simple_gc.py b/examples/node_prediction/citation_simple_gc.py index ea5fc03d..457a128b 100644 --- a/examples/node_prediction/citation_simple_gc.py +++ b/examples/node_prediction/citation_simple_gc.py @@ -45,21 +45,23 @@ def __call__(self, graph): learning_rate = 0.2 # Learning rate epochs = 20000 # Number of training epochs patience = 200 # Patience for early stopping +a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features n_out = dataset.n_out # Number of classes # Model definition -X_in = Input(shape=(F, )) -fltr_in = Input((N, ), sparse=True) +x_in = Input(shape=(F,)) +a_in = Input((N,), sparse=True, dtype=a_dtype) + output = GraphConv(n_out, activation='softmax', kernel_regularizer=l2(l2_reg), - use_bias=False)([X_in, fltr_in]) + use_bias=False)([x_in, a_in]) # Build model -model = Model(inputs=[X_in, fltr_in], outputs=output) +model = Model(inputs=[x_in, a_in], outputs=output) optimizer = Adam(lr=learning_rate) model.compile(optimizer=optimizer, loss='categorical_crossentropy', diff --git a/examples/node_prediction/ogbn-arxiv_gcn.py b/examples/node_prediction/ogbn-arxiv_gcn.py index 2bbb4e70..88055096 100644 --- a/examples/node_prediction/ogbn-arxiv_gcn.py +++ b/examples/node_prediction/ogbn-arxiv_gcn.py @@ -44,18 +44,18 @@ masks = [mask_tr, mask_va, mask_te] # Model definition -X_in = Input(shape=(F, )) -fltr_in = Input((N, ), sparse=True) -X_1 = GraphConv(channels, activation='relu')([X_in, fltr_in]) -X_1 = BatchNormalization()(X_1) -X_1 = Dropout(dropout)(X_1) -X_2 = GraphConv(channels, activation='relu')([X_1, fltr_in]) -X_2 = BatchNormalization()(X_2) -X_2 = Dropout(dropout)(X_2) -X_3 = GraphConv(n_out, activation='softmax')([X_2, fltr_in]) +x_in = Input(shape=(F,)) +a_in = Input((N,), sparse=True) +x_1 = GraphConv(channels, activation='relu')([x_in, a_in]) +x_1 = BatchNormalization()(x_1) +x_1 = Dropout(dropout)(x_1) +x_2 = GraphConv(channels, activation='relu')([x_1, a_in]) +x_2 = BatchNormalization()(x_2) +x_2 = Dropout(dropout)(x_2) +x_3 = GraphConv(n_out, activation='softmax')([x_2, a_in]) # Build model -model = Model(inputs=[X_in, fltr_in], outputs=X_3) +model = Model(inputs=[x_in, a_in], outputs=x_3) optimizer = Adam(lr=learning_rate) model.compile(optimizer=optimizer, loss=SparseCategoricalCrossentropy()) model.summary() @@ -63,8 +63,8 @@ # Evaluation with OGB evaluator = Evaluator(dataset_name) -def 
evaluate(X, fltr, y, model, masks, evaluator): - p = model.predict_on_batch([X, fltr]) +def evaluate(x, a, y, model, masks, evaluator): + p = model.predict_on_batch([x, a]) p = p.argmax(-1)[:, None] tr_mask, va_mask, te_mask = masks tr_auc = evaluator.eval({'y_true': y[tr_mask], 'y_pred': p[tr_mask]})['acc'] From 283c73ea4cea9cf1af4d4a9e68ff1843856ff25c Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 12 Nov 2020 18:00:51 +0100 Subject: [PATCH 31/57] Add documentation for Graph and Dataset Change Graph attributes --- examples/graph_prediction/custom_dataset.py | 2 +- examples/node_prediction/citation_arma.py | 2 +- examples/node_prediction/citation_cheby.py | 2 +- examples/node_prediction/citation_gat.py | 2 +- examples/node_prediction/citation_gat_fast.py | 2 +- examples/node_prediction/citation_gcn.py | 2 +- examples/node_prediction/citation_gcn_fast.py | 2 +- .../node_prediction/citation_simple_gc.py | 6 +- examples/node_prediction/ogbn-arxiv_gcn.py | 2 +- examples/other/node_clustering_mincut.py | 2 +- spektral/data/dataset.py | 160 ++++++++++++------ spektral/data/graph.py | 87 +++++++--- spektral/data/loaders.py | 4 +- spektral/datasets/citation.py | 8 +- spektral/datasets/graphsage.py | 4 +- spektral/datasets/qm9.py | 2 +- spektral/datasets/tudataset.py | 2 +- spektral/transforms/adj_to_sp_tensor.py | 4 +- spektral/transforms/degree.py | 4 +- spektral/transforms/gcn_filter.py | 4 +- spektral/transforms/layer_preprocess.py | 4 +- spektral/transforms/normalize_adj.py | 4 +- tests/test_data/test_dataset.py | 10 +- tests/test_data/test_graph.py | 4 +- tests/test_data/test_loaders.py | 8 +- tests/test_data/test_utils.py | 4 +- 26 files changed, 214 insertions(+), 123 deletions(-) diff --git a/examples/graph_prediction/custom_dataset.py b/examples/graph_prediction/custom_dataset.py index 2685f0a2..eb689270 100644 --- a/examples/graph_prediction/custom_dataset.py +++ b/examples/graph_prediction/custom_dataset.py @@ -75,7 +75,7 @@ def make_graph(): color_counts = x.sum(0) y[np.argmax(color_counts)] = 1 - return Graph(x=x, adj=a, y=y) + return Graph(x=x, a=a, y=y) # We must return a list of Graph objects return [make_graph() for _ in range(self.n_graphs)] diff --git a/examples/node_prediction/citation_arma.py b/examples/node_prediction/citation_arma.py index 1c9e74ad..0aeb6d82 100644 --- a/examples/node_prediction/citation_arma.py +++ b/examples/node_prediction/citation_arma.py @@ -32,7 +32,7 @@ learning_rate = 1e-2 # Learning rate epochs = 20000 # Number of training epochs patience = 100 # Patience for early stopping -a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 +a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features diff --git a/examples/node_prediction/citation_cheby.py b/examples/node_prediction/citation_cheby.py index 5322eb69..644f5c8d 100644 --- a/examples/node_prediction/citation_cheby.py +++ b/examples/node_prediction/citation_cheby.py @@ -30,7 +30,7 @@ learning_rate = 1e-2 # Learning rate epochs = 200 # Number of training epochs patience = 10 # Patience for early stopping -a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 +a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features diff --git a/examples/node_prediction/citation_gat.py b/examples/node_prediction/citation_gat.py index 12b36d45..ea484915 100644 --- a/examples/node_prediction/citation_gat.py +++ 
b/examples/node_prediction/citation_gat.py @@ -29,7 +29,7 @@ learning_rate = 5e-3 # Learning rate epochs = 20000 # Number of training epochs patience = 100 # Patience for early stopping -a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 +a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features diff --git a/examples/node_prediction/citation_gat_fast.py b/examples/node_prediction/citation_gat_fast.py index 6642efb3..7a8c2902 100644 --- a/examples/node_prediction/citation_gat_fast.py +++ b/examples/node_prediction/citation_gat_fast.py @@ -20,7 +20,7 @@ # Load data dataset = Cora(transforms=[LayerPreprocess(GraphAttention), AdjToSpTensor()]) graph = dataset[0] -x, a, y = graph.x, graph.adj, graph.y +x, a, y = graph.x, graph.a, graph.y mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Define model diff --git a/examples/node_prediction/citation_gcn.py b/examples/node_prediction/citation_gcn.py index 74ae906d..c4e62475 100644 --- a/examples/node_prediction/citation_gcn.py +++ b/examples/node_prediction/citation_gcn.py @@ -29,7 +29,7 @@ learning_rate = 1e-2 # Learning rate epochs = 200 # Number of training epochs patience = 10 # Patience for early stopping -a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 +a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features diff --git a/examples/node_prediction/citation_gcn_fast.py b/examples/node_prediction/citation_gcn_fast.py index b64047be..0a3044f2 100644 --- a/examples/node_prediction/citation_gcn_fast.py +++ b/examples/node_prediction/citation_gcn_fast.py @@ -19,7 +19,7 @@ # Load data dataset = Cora(transforms=[LayerPreprocess(GraphConv), AdjToSpTensor()]) graph = dataset[0] -x, a, y = graph.x, graph.adj, graph.y +x, a, y = graph.x, graph.a, graph.y mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Define model diff --git a/examples/node_prediction/citation_simple_gc.py b/examples/node_prediction/citation_simple_gc.py index 457a128b..70abef4b 100644 --- a/examples/node_prediction/citation_simple_gc.py +++ b/examples/node_prediction/citation_simple_gc.py @@ -26,11 +26,11 @@ def __init__(self, K): self.K = K def __call__(self, graph): - out = graph.adj + out = graph.a for i in range(self.K - 1): out = out.dot(out) out.sort_indices() - graph.adj = out + graph.a = out return graph @@ -45,7 +45,7 @@ def __call__(self, graph): learning_rate = 0.2 # Learning rate epochs = 20000 # Number of training epochs patience = 200 # Patience for early stopping -a_dtype = dataset[0].adj.dtype # Only needed for TF 2.1 +a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 N = dataset.N # Number of nodes in the graph F = dataset.F # Original size of node features diff --git a/examples/node_prediction/ogbn-arxiv_gcn.py b/examples/node_prediction/ogbn-arxiv_gcn.py index 88055096..b17ac4f1 100644 --- a/examples/node_prediction/ogbn-arxiv_gcn.py +++ b/examples/node_prediction/ogbn-arxiv_gcn.py @@ -20,7 +20,7 @@ ogb_dataset = NodePropPredDataset(dataset_name) dataset = OGB(ogb_dataset, transforms=[GCNFilter(), AdjToSpTensor()]) graph = dataset[0] -x, adj, y = graph.x, graph.adj, graph.y +x, adj, y = graph.x, graph.a, graph.y # Parameters channels = 256 # Number of channels for GCN layers diff --git a/examples/other/node_clustering_mincut.py b/examples/other/node_clustering_mincut.py index a259504b..62af8d2e 100644 --- 
a/examples/other/node_clustering_mincut.py +++ b/examples/other/node_clustering_mincut.py @@ -39,7 +39,7 @@ def train_step(inputs): # LOAD DATASET ################################################################################ dataset = Cora() -adj, x, y = dataset[0].adj, dataset[0].x, dataset[0].y +adj, x, y = dataset[0].a, dataset[0].x, dataset[0].y a_norm = normalized_adjacency(adj) a_norm = sp_matrix_to_sp_tensor(a_norm) F = dataset.F diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index bdfa436c..89bcb1fa 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -14,60 +14,103 @@ class Dataset: A container for Graph objects. This class can be extended to represent a graph dataset. - To extend this class, you must implement the `Dataset.read()` method, which + Datasets can be accessed with indices (`dataset[0]` returns a `Graph`), + iterables (`dataset[[1, 2, 3]]` returns a `Dataset`) or slices + (`dataset[start:stop]` also returns a `Dataset`). + They can also be shuffled (`np.random.shuffle(dataset)` shuffles in-place), + and iterated over (`for graph in dataset: ...`). + + They should generally behave like Numpy arrays for any operation that uses + simple 1D indexing. + + Datasets have the following properties that automatically computed from the + graphs: + + - `N`: the number of nodes in the dataset (returns `None` if the number + changes between graphs); + - `F`: the size of the node features (returns `None` if the size changes + between graphs or is not defined); + - `S`: the size of the edge features (returns `None` if the size changes + between graphs or is not defined); + - `n_labels`: the size of the labels (returns `None` if the size changes + between graphs or is not defined); this is computed as the innermost + dimension of the labels (i.e., `y.shape[-1]`). + + Any additional `kwargs` passed to the constructor will be automatically + assigned as instance attributes of the dataset. + + Datasets also offer three main manipulation functions to apply callables to + their graphs: + + - `apply(transform)`: replaces each graph with the output of + `transform(graph)`. This should always be a `Graph` object, although no + checks are made to ensure it (to give you more flexibility). See + `spektral.transforms` for some ready-to-use transforms. + For example: `apply(spektral.transforms.NormalizeAdj())` normalizes the + adjacency matrix of each graph in the dataset. + - `map(transform, reduce=None)`: returns a list containing the output + of `transform(graph)` for each graph. If `reduce` is a `callable`, then + returns `reduce(output_list)` instead of just `output_list`. + For instance: `map(lambda: g.N, reduce=np.mean)` will return the average + number of nodes in the dataset. + - `filter(function)`: removes from the dataset any graph for which + `function(graph)` returns `False`. + For example: `filter(lambda: g.N < 100)` removes from the dataset all graphs + bigger than 100 nodes. + + You can extend this class to create your own dataset. 
+ To create a `Dataset`, you must implement the `Dataset.read()` method, which must return a list of `spektral.data.Graph` objects, e.g., ``` class MyDataset(Dataset): def read(self): - return [ - Graph(x=np.random.rand(n, 2), - adj=np.random.randint(0, 2, (n, n)), - y=np.array([0., 1.])) - for n in range(size) - ] + return [Graph(x=x, adj=adj, y=y) for x, adj, y in some_magic_list] ``` - Datasets can be sliced (`dataset[start:stop]`), shuffled - (`np.random.shuffle(dataset)`), and iterated (`for graph in dataset: ...`). + The class also offers a `download()` method that is automatically called + if the path returned by the `Dataset.path` attribute does not exists. + This defaults to `~/.spektral/datasets/ClassName/'. - The size of the node features, edge features and targets is shared by all - graphs in a dataset and can be accessed respectively with: + You can implement this however you like, knowing that `download()` will be + called before `read()`. You can also override the `path` attribute to + whatever fits your needs. - ``` - >>> dataset.F - >>> dataset.S - >>> dataset.n_out - ``` + Have a look at the `spektral.datasets` module for examples of popular + datasets already implemented. + + **Arguments** + + - `transforms`: a callable or list of callables that are automatically + applied to the graphs after loading the dataset. """ def __init__(self, transforms=None, **kwargs): + + # Read extra kwargs + for k, v in kwargs.items(): + setattr(self, k, v) + + # Download data if not osp.exists(self.path): self.download() + + # Read graphs self.graphs = self.read() - # Make sure that we always have at least one graph if len(self.graphs) == 0: raise ValueError('Datasets cannot be empty') - # Read extra kwargs - for k, v in kwargs.items(): - setattr(self, k, v) - # Apply transforms if transforms is not None: if not isinstance(transforms, (list, tuple)) and callable(transforms): transforms = [transforms] elif not all([callable(t) for t in transforms]): - raise ValueError('transforms must be a list of callables or ' - 'a callable.') + raise ValueError('`transforms` must be a callable or list of ' + 'callables') else: pass for t in transforms: self.apply(t) - @property - def path(self): - return osp.join(DATASET_FOLDER, self.__class__.__name__) - def read(self): raise NotImplementedError @@ -75,17 +118,24 @@ def download(self): pass def apply(self, transform): + if not callable(transform): + raise ValueError('`transform` must be callable') + for i in range(len(self.graphs)): self.graphs[i] = transform(self.graphs[i]) def map(self, transform, reduce=None): + if not callable(transform): + raise ValueError('`transform` must be callable') + if reduce is not None and not callable(reduce): + raise ValueError('`reduce` must be callable') + out = [transform(g) for g in self.graphs] - if reduce is not None and callable(reduce): - return reduce(out) - else: - return out + return reduce(out) if reduce is not None else out def filter(self, function): + if not callable(function): + raise ValueError('`function` must be callable') self.graphs = [g for g in self.graphs if function(g)] def __getitem__(self, key): @@ -131,6 +181,10 @@ def __len__(self): def __repr__(self): return '{}({})'.format(self.__class__.__name__, self.__len__()) + @property + def path(self): + return osp.join(DATASET_FOLDER, self.__class__.__name__) + @property def N(self): if len(self.graphs) == 1 or len(set([g.N for g in self.graphs])) == 1: @@ -140,42 +194,50 @@ def N(self): @property def F(self): - return self.graphs[0].F + if 
len(self.graphs) == 1 or len(set([g.F for g in self.graphs])) == 1: + return self.graphs[0].F + else: + return None @property def S(self): - return self.graphs[0].S + if len(self.graphs) == 1 or len(set([g.S for g in self.graphs])) == 1: + return self.graphs[0].S + else: + return None @property def n_out(self): - y = self.graphs[0].y - if y is None: - return None + if len(self.graphs) == 1 or len(set([g.n_labels for g in self.graphs])) == 1: + return self.graphs[0].n_labels else: - shp = np.shape(y) - if len(shp) == 0: - return 1 - else: - return shp[-1] + return None + @property def signature(self): + """ + This property computes the signature of the dataset, which can be + passed to `spektral.data.utils.to_tf_signature(signature)` to compute + the TensorFlow signature. You can safely ignore this property unless + you are creating a custom `Loader`. + """ signature = {} graph = self.graphs[0] # This is always non-empty if graph.x is not None: signature['x'] = dict() signature['x']['spec'] = get_spec(graph.x) - signature['x']['shape'] = (None, graph.F) + signature['x']['shape'] = (None, self.F) signature['x']['dtype'] = tf.as_dtype(graph.x.dtype) - if graph.adj is not None: + if graph.a is not None: signature['a'] = dict() - signature['a']['spec'] = get_spec(graph.adj) + signature['a']['spec'] = get_spec(graph.a) signature['a']['shape'] = (None, None) - signature['a']['dtype'] = tf.as_dtype(graph.adj.dtype) - if graph.edge_attr is not None: + signature['a']['dtype'] = tf.as_dtype(graph.a.dtype) + if graph.e is not None: signature['e'] = dict() - signature['e']['spec'] = get_spec(graph.edge_attr) - signature['e']['shape'] = (None, graph.S) - signature['e']['dtype'] = tf.as_dtype(graph.edge_attr.dtype) + signature['e']['spec'] = get_spec(graph.e) + signature['e']['shape'] = (None, self.S) + signature['e']['dtype'] = tf.as_dtype(graph.e.dtype) if graph.y is not None: signature['y'] = dict() signature['y']['spec'] = get_spec(graph.y) diff --git a/spektral/data/graph.py b/spektral/data/graph.py index 2f8c6c9c..cdf3fcff 100644 --- a/spektral/data/graph.py +++ b/spektral/data/graph.py @@ -1,37 +1,67 @@ import numpy as np -import scipy.sparse as sp + class Graph: """ - A container to represent a graph with: - - node features; - - adjacency matrix; - - edge attributes; - - node or graph labels; + A container to represent a graph. The data associated with the Graph is + stored in its attributes: + + - `x`, for the node features; + - `a`, for the adjacency matrix; + - `e`, for the edge attributes; + - `y`, for the node or graph labels; - See the [data representation page](https://graphneural.network/data/) for - more info. + All of these default to `None` if you don't specify them in the constructor. + If you want to read all non-None attributes at once, you can call the + `numpy()` method, which will return all data in a tuple (with the order + defined above). - This class exposes the following attributes: + Graphs also have the following attributes that are computed automatically + from the data: - `N`: number of nodes; - - `F`: size of the node features; - - `S`: size of the edge features; + - `F`: size of the node features, if available; + - `S`: size of the edge features, if available; + - `n_labels`: size of the labels, if available; + + Any additional `kwargs` passed to the constructor will be automatically + assigned as instance attributes of the graph. + + Data can be stored in Numpy arrays or Scipy sparse matrices, and labels can + also be scalars. 
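For reference, the renamed `Graph` attributes introduced by this patch (`a` and `e` instead of `adj` and `edge_attr`) fit together as in the short sketch below. This is an illustrative example, not part of the patch; it assumes `Graph` is importable from `spektral.data`, as in the examples and tests elsewhere in this series, and the shapes follow the conventions described in the docstring above.

```py
# Minimal sketch (illustrative only): building a Graph with the attribute
# names introduced in this patch and reading its derived properties.
import numpy as np
import scipy.sparse as sp

from spektral.data import Graph

n, f, s = 5, 4, 3                                   # nodes, node features, edge features
x = np.random.rand(n, f)                            # node features, shape (N, F)
a = sp.csr_matrix(np.random.randint(0, 2, (n, n)))  # adjacency matrix, shape (N, N)
e = np.random.rand(a.nnz, s)                        # sparse edge attributes, shape (n_edges, S)
y = np.array([0., 1.])                              # graph-level label, shape (n_labels,)

g = Graph(x=x, a=a, e=e, y=y)
print(g.N, g.F, g.S, g.n_labels)  # -> 5 4 3 2
```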
+ + Spektral usually assumes that the different data matrices have specific + shapes, although this is not strictly enforced to allow more flexibility. + In general, node attributes should have shape `(N, F)` and the adjacency + matrix should have shape `(N, N)`. + + A Graph should always have either the node features or the adjacency matrix. + + Edge attributes can be stored in a dense format as arrays of shape + `(N, N, S)` or in a sparse format as arrays of shape `(n_edges, S)` + (so that you don't have to store all the zeros for missing edges). Most + components of Spektral will know how to deal with both situations + automatically. + + Labels can refer to the entire graph (shape `(n_labels, )`) or to each + individual node (shape `(N, n_labels)`). **Arguments** - `x`: np.array, the node features (shape `(N, F)`); - - `adj`: np.array or scipy.sparse matrix, the adjacency matrix (shape `(N, N)`); - - `edge_attr`: np.array, the edge features (shape `(N, N, S)`); - - `y`: np.array, the node or graph labels (shape `(N, n_labels)` or - `(n_labels, )`); + - `a`: np.array or scipy.sparse matrix, the adjacency matrix (shape `(N, N)`); + - `e`: np.array, the edge features (shape `(N, N, S)` or `(n_edges, S)`); + - `y`: np.array, the node or graph labels (shape `(N, n_labels)` or `(n_labels, )`); """ - def __init__(self, x=None, adj=None, edge_attr=None, y=None, **kwargs): + def __init__(self, x=None, a=None, e=None, y=None, **kwargs): + if x is None and a is None: + raise ValueError('A Graph should have either node attributes or ' + 'an adjacency matrix. Got both None.') self.x = x - self.adj = adj - self.edge_attr = edge_attr + self.a = a + self.e = e self.y = y # Read extra kwargs @@ -39,7 +69,7 @@ def __init__(self, x=None, adj=None, edge_attr=None, y=None, **kwargs): self[k] = v def numpy(self): - return tuple(ret for ret in [self.x, self.adj, self.edge_attr, self.y] + return tuple(ret for ret in [self.x, self.a, self.e, self.y] if ret is not None) def __setitem__(self, key, value): @@ -49,15 +79,15 @@ def __getitem__(self, key): return getattr(self, key, None) def __repr__(self): - return 'Graph(N={}, F={}, S={}, y={}'\ + return 'Graph(N={}, F={}, S={}, y={})'\ .format(self.N, self.F, self.S, self.y) @property def N(self): if self.x is not None: return self.x.shape[-2] - elif self.adj is not None: - return self.adj.shape[-1] + elif self.a is not None: + return self.a.shape[-1] else: return None @@ -70,7 +100,16 @@ def F(self): @property def S(self): - if self.edge_attr is not None: - return self.edge_attr.shape[-1] + if self.e is not None: + return self.e.shape[-1] + else: + return None + + @property + def n_labels(self): + if self.y is not None: + shp = np.shape(self.y) + return 1 if len(shp) == 0 else shp[-1] else: return None + diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index 0744bdce..2a07b6c8 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -105,7 +105,7 @@ def tf(self): lambda: (_ for _ in self), output_signature=self.tf_signature()) def tf_signature(self): - signature = self.dataset.signature() + signature = self.dataset.signature if 'y' in signature: signature['y']['shape'] = prepend_none(signature['y']['shape']) if 'a' in signature: @@ -138,7 +138,7 @@ def tf(self): lambda: (_ for _ in self), output_signature=self.tf_signature()) def tf_signature(self): - signature = self.dataset.signature() + signature = self.dataset.signature for k in signature: signature[k]['shape'] = prepend_none(signature[k]['shape']) if 'a' in signature: diff 
--git a/spektral/datasets/citation.py b/spektral/datasets/citation.py index 06c2b51d..706c1a79 100644 --- a/spektral/datasets/citation.py +++ b/spektral/datasets/citation.py @@ -92,16 +92,16 @@ def read(self): idx_te, y_te, train_size=30 * n_classes, stratify=y_te) # Adjacency matrix - adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) # CSR - adj.setdiag(0) - adj.eliminate_zeros() + a = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) # CSR + a.setdiag(0) + a.eliminate_zeros() # Train/valid/test masks self.mask_tr = _idx_to_mask(idx_tr, y.shape[0]) self.mask_va = _idx_to_mask(idx_va, y.shape[0]) self.mask_te = _idx_to_mask(idx_te, y.shape[0]) - return [Graph(x=x, adj=adj, y=y)] + return [Graph(x=x, a=a, y=y)] def download(self): print('Downloading {} dataset.'.format(self.name)) diff --git a/spektral/datasets/graphsage.py b/spektral/datasets/graphsage.py index 78237a10..2a111749 100644 --- a/spektral/datasets/graphsage.py +++ b/spektral/datasets/graphsage.py @@ -66,7 +66,7 @@ def read(self): npz_file = osp.join(self.path, self.name) + '.npz' data = np.load(npz_file) x = data['x'] - adj = sp.csr_matrix( + a = sp.csr_matrix( (data['adj_data'], (data['adj_row'], data['adj_col'])), shape=data['adj_shape'] ) @@ -75,7 +75,7 @@ def read(self): self.mask_va = data['mask_va'] self.mask_te = data['mask_te'] - return [Graph(x=x, adj=adj, y=y)] + return [Graph(x=x, a=a, y=y)] def download(self): print('Downloading {} dataset.'.format(self.name)) diff --git a/spektral/datasets/qm9.py b/spektral/datasets/qm9.py index 420066ae..5387dcd7 100644 --- a/spektral/datasets/qm9.py +++ b/spektral/datasets/qm9.py @@ -71,7 +71,7 @@ def read(self): if self.amount is not None: labels = labels[:self.amount] - return [Graph(x=x, adj=a, edge_attr=e, y=y) + return [Graph(x=x, a=a, e=e, y=y) for x, a, e, y in zip(x_list, a_list, e_list, labels)] diff --git a/spektral/datasets/tudataset.py b/spektral/datasets/tudataset.py index 48443b56..8dc8a3a1 100644 --- a/spektral/datasets/tudataset.py +++ b/spektral/datasets/tudataset.py @@ -176,7 +176,7 @@ def read(self): # Convert to Graph print('Successfully loaded {}.'.format(self.name)) - return [Graph(x=x, adj=a, edge_attr=e, y=y) + return [Graph(x=x, a=a, e=e, y=y) for x, a, e, y in zip(x_list, a_list, e_list, labels)] def available_datasets(self): diff --git a/spektral/transforms/adj_to_sp_tensor.py b/spektral/transforms/adj_to_sp_tensor.py index 212eef9f..8449cb66 100644 --- a/spektral/transforms/adj_to_sp_tensor.py +++ b/spektral/transforms/adj_to_sp_tensor.py @@ -3,7 +3,7 @@ class AdjToSpTensor(object): def __call__(self, graph): - if graph.adj is not None: - graph.adj = sp_matrix_to_sp_tensor(graph.adj) + if graph.a is not None: + graph.a = sp_matrix_to_sp_tensor(graph.a) return graph diff --git a/spektral/transforms/degree.py b/spektral/transforms/degree.py index 0b44a793..cb54958f 100644 --- a/spektral/transforms/degree.py +++ b/spektral/transforms/degree.py @@ -8,7 +8,7 @@ def __init__(self, max_degree): self.max_degree = max_degree def __call__(self, graph): - degree = graph.adj.sum(1) + degree = graph.a.sum(1) degree = one_hot(degree, self.max_degree + 1) if graph.x is None: graph.x = degree @@ -20,4 +20,4 @@ def __call__(self, graph): class MaxDegree(object): def __call__(self, graph): - return graph.adj.sum(1).max() + return graph.a.sum(1).max() diff --git a/spektral/transforms/gcn_filter.py b/spektral/transforms/gcn_filter.py index e2168425..fff65702 100644 --- a/spektral/transforms/gcn_filter.py +++ b/spektral/transforms/gcn_filter.py @@ -6,7 +6,7 @@ def 
__init__(self, symmetric=True): self.symmetric = symmetric def __call__(self, graph): - if graph.adj is not None: - graph.adj = gcn_filter(graph.adj, self.symmetric) + if graph.a is not None: + graph.a = gcn_filter(graph.a, self.symmetric) return graph diff --git a/spektral/transforms/layer_preprocess.py b/spektral/transforms/layer_preprocess.py index 482577c6..dced889b 100644 --- a/spektral/transforms/layer_preprocess.py +++ b/spektral/transforms/layer_preprocess.py @@ -12,7 +12,7 @@ def __init__(self, layer_class): self.layer_class = layer_class def __call__(self, graph): - if graph.adj is not None and hasattr(self.layer_class, 'preprocess'): - graph.adj = self.layer_class.preprocess(graph.adj) + if graph.a is not None and hasattr(self.layer_class, 'preprocess'): + graph.a = self.layer_class.preprocess(graph.a) return graph diff --git a/spektral/transforms/normalize_adj.py b/spektral/transforms/normalize_adj.py index 308be920..f83528ba 100644 --- a/spektral/transforms/normalize_adj.py +++ b/spektral/transforms/normalize_adj.py @@ -6,7 +6,7 @@ def __init__(self, symmetric=True): self.symmetric = symmetric def __call__(self, graph): - if graph.adj is not None: - graph.adj = normalized_adjacency(graph.adj, self.symmetric) + if graph.a is not None: + graph.a = normalized_adjacency(graph.a, self.symmetric) return graph diff --git a/tests/test_data/test_dataset.py b/tests/test_data/test_dataset.py index cddefe14..6ccfcbaf 100644 --- a/tests/test_data/test_dataset.py +++ b/tests/test_data/test_dataset.py @@ -12,9 +12,7 @@ class TestDataset(Dataset): def read(self): return [ - Graph(x=np.random.rand(n, f), - adj=np.random.randint(0, 2, (n, n)), - edge_attr=np.random.rand(n, n, s), + Graph(x=np.random.rand(n, f), a=np.random.randint(0, 2, (n, n)), e=np.random.rand(n, n, s), y=np.array([0., 1.])) for n in Ns ] @@ -29,7 +27,7 @@ def test_dataset(): # signature for k in ['x', 'a', 'e', 'y']: - assert k in d.signature() + assert k in d.signature # __getitem__ assert isinstance(d[0], Graph) @@ -38,9 +36,7 @@ def test_dataset(): # __setitem__ n = 100 - g = Graph(x=np.random.rand(n, f), - adj=np.random.randint(0, 2, (n, n)), - edge_attr=np.random.rand(n, n, s), + g = Graph(x=np.random.rand(n, f), a=np.random.randint(0, 2, (n, n)), e=np.random.rand(n, n, s), y=np.array([0., 1.])) # single assignment diff --git a/tests/test_data/test_graph.py b/tests/test_data/test_graph.py index 65d9012b..ddc53284 100644 --- a/tests/test_data/test_graph.py +++ b/tests/test_data/test_graph.py @@ -10,8 +10,8 @@ def _check_graph(x, a, e, y): g = Graph(x=x) - g = Graph(adj=a) - g = Graph(x=x, adj=a, edge_attr=e, y=y) + g = Graph(a=a) + g = Graph(x=x, a=a, e=e, y=y) # numpy g_np = g.numpy() diff --git a/tests/test_data/test_loaders.py b/tests/test_data/test_loaders.py index 65f06eb8..c420fd00 100644 --- a/tests/test_data/test_loaders.py +++ b/tests/test_data/test_loaders.py @@ -24,9 +24,7 @@ class TestDatasetSingle(Dataset): def read(self): n = 10 return [ - Graph(x=np.random.rand(n, f), - adj=sp.csr_matrix(np.random.randint(0, 2, (n, n))), - edge_attr=np.random.rand(n, n, s), + Graph(x=np.random.rand(n, f), a=sp.csr_matrix(np.random.randint(0, 2, (n, n))), e=np.random.rand(n, n, s), y=np.array(n * [[0., 1.]])) ] @@ -37,9 +35,7 @@ class TestDataset(Dataset): """ def read(self): return [ - Graph(x=np.random.rand(n, f), - adj=sp.csr_matrix(np.random.randint(0, 2, (n, n))), - edge_attr=np.random.rand(n, n, s), + Graph(x=np.random.rand(n, f), a=sp.csr_matrix(np.random.randint(0, 2, (n, n))), e=np.random.rand(n, n, s), 
y=np.array([0., 1.])) for n in ns ] diff --git a/tests/test_data/test_utils.py b/tests/test_data/test_utils.py index 235f296f..fea2f3e7 100644 --- a/tests/test_data/test_utils.py +++ b/tests/test_data/test_utils.py @@ -38,9 +38,7 @@ def test_batch_generator(): class TestDataset(Dataset): def read(self): return [ - Graph(x=np.random.rand(n, 2), - adj=np.random.randint(0, 2, (n, n)), - y=np.array([0., 1.])) + Graph(x=np.random.rand(n, 2), a=np.random.randint(0, 2, (n, n)), y=np.array([0., 1.])) for n in range(size) ] From 77419d5b9a6eb26a09cb7405f2209ddafbba0ad7 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 13 Nov 2020 10:25:34 +0100 Subject: [PATCH 32/57] Remove useless E from MP layers --- spektral/layers/convolutional/agnn_conv.py | 6 +++--- spektral/layers/convolutional/gated_graph_conv.py | 8 ++++---- spektral/layers/convolutional/gin_conv.py | 4 ++-- spektral/layers/convolutional/tag_conv.py | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/spektral/layers/convolutional/agnn_conv.py b/spektral/layers/convolutional/agnn_conv.py index 4ae9fea2..b7b3a7bb 100644 --- a/spektral/layers/convolutional/agnn_conv.py +++ b/spektral/layers/convolutional/agnn_conv.py @@ -58,9 +58,9 @@ def build(self, input_shape): self.built = True def call(self, inputs, **kwargs): - X, A, E = self.get_inputs(inputs) - X_norm = K.l2_normalize(X, axis=-1) - output = self.propagate(X, A, E, X_norm=X_norm) + x, a, _ = self.get_inputs(inputs) + X_norm = K.l2_normalize(x, axis=-1) + output = self.propagate(x, a, X_norm=X_norm) output = self.activation(output) return output diff --git a/spektral/layers/convolutional/gated_graph_conv.py b/spektral/layers/convolutional/gated_graph_conv.py index 6cc5cc24..de53d334 100644 --- a/spektral/layers/convolutional/gated_graph_conv.py +++ b/spektral/layers/convolutional/gated_graph_conv.py @@ -101,14 +101,14 @@ def build(self, input_shape): self.built = True def call(self, inputs): - X, A, E = self.get_inputs(inputs) - F = K.int_shape(X)[-1] + x, a, _ = self.get_inputs(inputs) + F = K.int_shape(x)[-1] to_pad = self.channels - F - output = tf.pad(X, [[0, 0], [0, to_pad]]) + output = tf.pad(x, [[0, 0], [0, to_pad]]) for i in range(self.n_layers): m = tf.matmul(output, self.kernel[i]) - m = self.propagate(m, A) + m = self.propagate(m, a) output = self.rnn(m, [output])[0] output = self.activation(output) diff --git a/spektral/layers/convolutional/gin_conv.py b/spektral/layers/convolutional/gin_conv.py index 78d2f9cf..1b9c9633 100644 --- a/spektral/layers/convolutional/gin_conv.py +++ b/spektral/layers/convolutional/gin_conv.py @@ -109,8 +109,8 @@ def build(self, input_shape): self.built = True def call(self, inputs): - X, A, E = self.get_inputs(inputs) - output = self.mlp((1.0 + self.eps) * X + self.propagate(X, A, E)) + x, a, _ = self.get_inputs(inputs) + output = self.mlp((1.0 + self.eps) * x + self.propagate(x, a)) return output diff --git a/spektral/layers/convolutional/tag_conv.py b/spektral/layers/convolutional/tag_conv.py index 9c121b23..3a2aaf98 100644 --- a/spektral/layers/convolutional/tag_conv.py +++ b/spektral/layers/convolutional/tag_conv.py @@ -87,12 +87,12 @@ def build(self, input_shape): self.built = True def call(self, inputs, **kwargs): - X, A, E = self.get_inputs(inputs) - edge_weight = A.values + x, a, _ = self.get_inputs(inputs) + edge_weight = a.values - output = [X] + output = [x] for k in range(self.K): - output.append(self.propagate(X, A, E, edge_weight=edge_weight)) + output.append(self.propagate(x, a, 
edge_weight=edge_weight)) output = K.concatenate(output) return self.linear(output) From 71d596ccc10fc381b9780f6785fbda275dff8f84 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Sat, 14 Nov 2020 16:32:04 +0100 Subject: [PATCH 33/57] Clean up utils Remove chem module Remove utils.conversion Add node-level labels support to DisjointLoader Add docs for data and transforms modules Reorganize docs Bump version to 1.0.0 in preparation to release --- README.md | 5 +- docs/autogen.py | 130 +++----- docs/mkdocs.yml | 10 +- docs/templates/chem.md | 7 - docs/templates/data-representation.md | 142 ++++++++ docs/templates/data.md | 143 +------- docs/templates/datasets.md | 27 -- docs/templates/{utils/data.md => loaders.md} | 2 +- .../{utils/conversion.md => transforms.md} | 2 +- setup.py | 2 +- spektral/__init__.py | 2 +- spektral/chem.py | 313 ------------------ spektral/data/__init__.py | 3 +- spektral/data/dataset.py | 9 +- spektral/data/graph.py | 10 + spektral/data/loaders.py | 285 +++++++++++++--- spektral/data/utils.py | 6 +- spektral/datasets/__init__.py | 2 +- spektral/transforms/__init__.py | 3 + spektral/transforms/adj_to_sp_tensor.py | 3 + spektral/transforms/constant.py | 10 + spektral/transforms/degree.py | 21 +- spektral/transforms/gcn_filter.py | 14 + spektral/transforms/normalize_adj.py | 11 + spektral/transforms/normalize_one.py | 9 + spektral/transforms/normalize_sphere.py | 14 +- spektral/transforms/one_hot.py | 13 + spektral/utils/__init__.py | 1 - spektral/utils/conversion.py | 203 ------------ spektral/utils/convolution.py | 23 ++ spektral/utils/io.py | 34 +- spektral/utils/misc.py | 114 +------ tests/test_data/test_loaders.py | 47 ++- 33 files changed, 665 insertions(+), 955 deletions(-) delete mode 100644 docs/templates/chem.md create mode 100644 docs/templates/data-representation.md rename docs/templates/{utils/data.md => loaders.md} (64%) rename docs/templates/{utils/conversion.md => transforms.md} (58%) delete mode 100644 spektral/chem.py delete mode 100644 spektral/utils/conversion.py diff --git a/README.md b/README.md index ddf6696a..ac5381eb 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,6 @@ You can also cite the paper introducing Spektral: [Graph Neural Networks in Tens Spektral is compatible with Python 3.5+, and is tested on Ubuntu 16.04+ and MacOS. Other Linux distros should work as well, but Windows is not supported for now. -Some optional features of Spektral depend on [RDKit](http://www.rdkit.org/docs/index.html), -a library for cheminformatics and molecule manipulation (available through Anaconda). - The simplest way to install Spektral is from PyPi: ```bash @@ -76,7 +73,7 @@ git checkout tf1 python setup.py install # Or 'pip install .' ``` -In the future, the TF1-compatible version of Spektral (<0.2) will receive bug fixes, but all new features will only support TensorFlow 2. +In the future, the TF1-compatible version of Spektral (<0.3) will receive bug fixes, but all new features will only support TensorFlow 2. ## Contributing Spektral is an open source project available [on Github](https://github.com/danielegrattarola/spektral), and contributions of all types are welcome. 
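To put the loader-related changes of this commit in context, the intended usage looks roughly like the sketch below. This is illustrative only and not part of the patch: the `DisjointLoader(dataset, batch_size=..., epochs=...)` keyword arguments are an assumption, and the synthetic dataset simply mirrors the ones used in the tests earlier in this series.

```py
# Illustrative sketch (not part of the patch): a tiny synthetic Dataset fed
# to a DisjointLoader. The loader's constructor keywords are assumed.
import numpy as np
import scipy.sparse as sp

from spektral.data import Dataset, DisjointLoader, Graph


class ToyDataset(Dataset):
    def read(self):
        # Ten random graphs with 5-14 nodes, 4 node features, graph-level labels
        return [Graph(x=np.random.rand(n, 4),
                      a=sp.csr_matrix(np.random.randint(0, 2, (n, n))),
                      y=np.array([0., 1.]))
                for n in range(5, 15)]


dataset = ToyDataset()
loader = DisjointLoader(dataset, batch_size=4, epochs=1)

for batch in loader:
    # In disjoint mode each batch is the disjoint union of its graphs:
    # node features are stacked, adjacency matrices form a block-diagonal
    # sparse matrix, and an index vector maps every node back to its graph.
    pass
```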
diff --git a/docs/autogen.py b/docs/autogen.py index f28069af..95163beb 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -8,10 +8,10 @@ import shutil import sys -import spektral.data.utils -from spektral import chem +from spektral import data from spektral import datasets from spektral import layers +from spektral import transforms from spektral import utils try: @@ -81,73 +81,79 @@ layers.Disjoint2Batch, ] }, - # Datasets ################################################################# + # Data ##################################################################### { - 'page': 'datasets.md', - 'functions': [ - datasets.citation.load_data - ], - 'methods': [], - 'classes': [] - }, - { - 'page': 'datasets.md', - 'functions': [ - datasets.graphsage.load_data - ], + 'page': 'data.md', + 'functions': [], 'methods': [], - 'classes': [] + 'classes': [ + data.Graph + ] }, { - 'page': 'datasets.md', + 'page': 'data.md', 'functions': [], 'methods': [], 'classes': [ - datasets.tudataset.TUDataset + data.Dataset ] }, { - 'page': 'datasets.md', + 'page': 'data.md', 'functions': [ - datasets.ogb.graph_to_numpy, - datasets.ogb.dataset_to_numpy + data.utils.to_disjoint, + data.utils.to_batch, + data.utils.to_tf_signature ], 'methods': [], 'classes': [] }, + # Loaders ################################################################## { - 'page': 'datasets.md', + 'page': 'loaders.md', 'functions': [], 'methods': [], 'classes': [ - datasets.qm9.QM9 + data.Loader, + data.SingleLoader, + data.DisjointLoader, + data.BatchLoader, + data.PackedBatchLoader ] }, + # Datasets ################################################################# { 'page': 'datasets.md', - 'functions': [ - datasets.mnist.load_data - ], + 'functions': [], 'methods': [], - 'classes': [] + 'classes': [ + datasets.Citation, + datasets.GraphSage, + datasets.PPI, + datasets.Reddit, + datasets.OGB, + datasets.QM9, + datasets.TUDataset, + ] }, - # Utils #################################################################### + # Transforms ############################################################### { - 'page': 'utils/data.md', - 'functions': [ - spektral.data.utils.to_disjoint, - spektral.data.utils.to_batch, - spektral.data.utils.batch_generator - ], + 'page': 'transforms.md', + 'functions': [], + 'methods': [], 'classes': [ - spektral.data.Graph, - spektral.data.Dataset, - spektral.data.Loader, - spektral.data.DisjointLoader, - spektral.data.BatchLoader, - spektral.data.PackedBatchLoader + transforms.AdjToSpTensor, + transforms.Constant, + transforms.Degree, + transforms.GCNFilter, + transforms.LayerPreprocess, + transforms.NormalizeAdj, + transforms.NormalizeOne, + transforms.NormalizeSphere, + transforms.OneHotLabels ] }, + # Utils #################################################################### { 'page': 'utils/convolution.md', 'functions': [ @@ -157,6 +163,7 @@ utils.convolution.laplacian, utils.convolution.normalized_laplacian, utils.convolution.rescale_laplacian, + utils.convolution.add_self_loops, utils.convolution.gcn_filter, utils.convolution.chebyshev_polynomial, utils.convolution.chebyshev_filter @@ -168,46 +175,9 @@ 'page': 'utils/misc.md', 'functions': [ utils.misc.pad_jagged_array, - utils.misc.add_eye, - utils.misc.sub_eye, - utils.misc.add_eye_batch, - utils.misc.sub_eye_batch, - utils.misc.add_eye_jagged, - utils.misc.sub_eye_jagged, - ], - 'methods': [], - 'classes': [] - }, - { - 'page': 'utils/conversion.md', - 'functions': [ - utils.conversion.nx_to_adj, - utils.conversion.nx_to_node_features, - 
utils.conversion.nx_to_edge_features, - utils.conversion.nx_to_numpy, - utils.conversion.numpy_to_nx - ], - 'methods': [], - 'classes': [] - }, - # Chem ##################################################################### - { - 'page': 'chem.md', - 'functions': [ - chem.numpy_to_rdkit, - chem.numpy_to_smiles, - chem.rdkit_to_smiles, - chem.sdf_to_nx, - chem.nx_to_sdf, - chem.validate_rdkit, - chem.get_atomic_symbol, - chem.get_atomic_num, - chem.valid_score, - chem.novel_score, - chem.unique_score, - chem.enable_rdkit_log, - chem.plot_rdkit, - chem.plot_rdkit_svg_grid + utils.misc.one_hot, + utils.misc.label_to_one_hot, + utils.misc.flatten_list ], 'methods': [], 'classes': [] diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index ff2af231..352bafa3 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -30,19 +30,19 @@ nav: - Home: index.md - Tutorials: - Getting started: getting-started.md - - Data representation: data.md + - Data representation: data-representation.md - Examples: examples.md - Layers: - Convolutional Layers: layers/convolution.md - Pooling Layers: layers/pooling.md - Base Layers: layers/base.md - Data: + - Containers: data.md - Datasets: datasets.md - - Data utils: utils/data.md + - Loaders: loaders.md + - Transforms: transforms.md - Utils: - Convolution: utils/convolution.md - Miscellaneous: utils/misc.md - - Conversion: utils/conversion.md -- Chemistry: chem.md - Other: - - About: about.md + - About: about.md diff --git a/docs/templates/chem.md b/docs/templates/chem.md deleted file mode 100644 index dabc1dd7..00000000 --- a/docs/templates/chem.md +++ /dev/null @@ -1,7 +0,0 @@ -# Chemistry - -This module provides some functions to work with molecules, and requires -the [RDKit](http://www.rdkit.org/docs/index.html) library to be -installed on the system. - -{{autogenerated}} \ No newline at end of file diff --git a/docs/templates/data-representation.md b/docs/templates/data-representation.md new file mode 100644 index 00000000..fbd9a69b --- /dev/null +++ b/docs/templates/data-representation.md @@ -0,0 +1,142 @@ +# Representing graphs + +In Spektral, graphs are represented as matrices: + +- `A` is the adjacency matrix of shape `(N, N)`, where `N` is the number of nodes. `A` is a binary matrix where `A[i, j] = 1` if there is an edge between nodes `i` and `j`, and `0` otherwise. +- `X` is the node attributes matrix of shape `(N, F)`, where `F` is the size of the node attributes. + +Sometimes, we can also have edge attributes of size `S`, which we store in a matrix `E` of shape `(n_edges, S)` where each row is associated to a non-zero entry of `A`: assuming that `A` is a Scipy sparse matrix, we have that `E[i]` is the attribute associated to `A.data[i]`. + +## Modes + +Spektral supports four different ways of representing graphs or batches of graphs, which we refer to as **data modes**. + +- In **single mode**, we have one graph with its adjacency matrix and attributes; +- **Disjoint mode** is a special case of single mode, where the graph is the disjoint union of a set of graphs; +- In **batch mode**, a set of graphs is represented by stacking their adjacency and node attributes matrices in higher order tensors of shape `(batch, N, ...)`; +- In **mixed mode**, we have a single adjacency matrix shared by a set of graphs; the adjacency matrix will be in single mode, but the node attributes will be in batch mode. 
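As an editorial aside (not part of the patch): the sparse edge-attribute convention stated at the top of this new page, where `E[i]` is the attribute of the edge stored at `A.data[i]`, can be made concrete with a short sketch before the mode-by-mode breakdown below.

```py
# Sketch of the edge-attribute convention: with a canonical Scipy CSR
# adjacency matrix A, row i of E holds the attributes of the edge stored
# at A.data[i], i.e. edges follow A's internal (row-major) ordering.
import numpy as np
import scipy.sparse as sp

A = sp.csr_matrix(np.array([[0, 1, 0],
                            [1, 0, 1],
                            [0, 1, 0]]))
S = 2                         # size of the edge attributes
E = np.random.rand(A.nnz, S)  # one row of attributes per stored edge

row, col = A.nonzero()        # same ordering as A.data for a canonical CSR matrix
for i in range(A.nnz):
    print('edge ({}, {}) -> attributes {}'.format(row[i], col[i], E[i]))
```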
+ +The difference between the four data modes can be easily seen in how `A`, `X`, and `E` have different shapes in each case: + +|Mode | `A.shape` | `X.shape` | `E.shape` | +|:------:|:-------------:|:-------------:|:----------------:| +|Single |`(N, N)` |`(N, F)` |`(n_edges, S)` | +|Disjoint|`(N, N)` |`(N, F)` |`(n_edges, S)` | +|Batch |`(batch, N, N)`|`(batch, N, F)`|`(batch, N, N, S)`| +|Mixed |`(N, N)` |`(batch, N, F)`| N/A | + + + +## Single mode + + + +In **single mode** the data describes a single graph where: + +- `A` is a sparse matrix of shape `(N, N)`; +- `X` is a matrix of shape `(N, F)`; + +When edge attributes are present, we represent them as a matrix `E` of shape `(n_edges, S)` so that there is a correspondence between `E[i]` and `A.data[i]`. + +Three very popular datasets in this setting are the citation networks: Cora, Citeseer, and Pubmed. To load a citation network, you can use the built-in loader: + +```py +>>> from spektral.datasets import citation +>>> A, X, _, _, _, _ = citation.load_data('cora') +Loading cora dataset +>>> A.shape +(2708, 2708) +>>> X.shape +(2708, 1433) +``` + +## Disjoint mode + + + +**Disjoint mode** is a smart way of representing a set of graphs as a single graph. +In particular, the disjoint union of a batch is a graph where + +- `A` is a sparse block diagonal matrix, where each block is the adjacency matrix `A_i` of the i-th graph; +- `X` is obtained by stacking the node attributes matrices of the graphs. + +When edge attributes are present, we represent them as a matrix `E` of shape `(n_edges, S)` so that there is a correspondence between `E[i]` and `A.data[i]`. + +In order to keep track of different graphs in the disjoint union, we use an additional array of integers `I` that identifies which nodes belong to the same graph. +For convolutional layers, disjoint mode is indistinguishable from single mode because it is not possible to exchange messages between the components of the graph, so `I` is not needed to compute the output. +Pooling layers, on the other hand, require `I` to know which nodes can be pooled together. +Hierarchical pooling layers will return a reduced version of `I` along with the reduced graphs. Global pooling layers will consume `I` and reduce the graphs to single vectors. + +Utilities for creating the disjoint union of a list of graphs are provided in `spektral.utils.data`: + +```py +>>> from spektral.utils.data import to_disjoint +>>> A_list = [np.ones((2, 2)), np.ones((3, 3))] # One graph has 2 nodes, the other has 3 +>>> X_list = [np.random.randn(2, 4), np.random.randn(3, 4)] # F = 4 +>>> X, A, I = to_disjoint(X_list, A_list) +>>> X.shape +(5, 4) +>>> A.shape +(5, 5) +>>> A.toarray() +array([[1., 1., 0., 0., 0.], + [1., 1., 0., 0., 0.], + [0., 0., 1., 1., 1.], + [0., 0., 1., 1., 1.], + [0., 0., 1., 1., 1.]]) +>>> I +array([0, 0, 1, 1, 1]) +``` + +## Batch mode + + + +In **batch mode**, graphs have the same number of nodes and are stacked in tensors of shape `(batch, N, ...)`. +Due to the general lack of support for sparse higher-order tensors both in Scipy and TensorFlow, `A` and `X` will be dense tensors. + +In this case, edge attributes must also be reshaped and made dense, so that `E` has shape `(batch, N, N, S)` (the attributes of non-existing edges are usually all zeros). + +Note that if the graphs have variable number of nodes, the matrices must be zero-padded so that they have the same `N`. 
+If you don't want to zero-pad the graphs or work with dense inputs, it is better to work in [disjoint mode](https://danielegrattarola.github.io/spektral/data/#disjoint-mode) instead. + +The advantage of batch mode is that it is more intuitive and it allows to use the training loop of `tf.keras` without any modifications. Also, some pooling layers like `DiffPool` and `MinCutPool` will only work in batch mode. + +For example, the QM9 dataset of small molecules will be loaded in batch mode by default: + +```py +>>> from spektral.datasets import qm9 +>>> A, X, E, y = qm9.load_data() +Loading QM9 dataset. +Reading SDF +>>> A.shape +(133885, 9, 9) +>>> X.shape +(133885, 9, 6) +>>> E.shape +(133885, 9, 9, 5) +``` + +## Mixed mode + + + +In **mixed mode** we consider a single adjacency matrix that acts as the support for different node attributes (also sometimes called "signals"). + +In this case we have that: + +- `A` is a sparse matrix of shape `(N, N)`; +- `X` is a tensor in batch mode, of shape `(batch, N, F)`; + +Currently, there are no layers in Spektral that support mixed mode and edge attributes. + +An example of a mixed mode dataset is the MNIST random grid ([Defferrard et al., 2016](https://arxiv.org/abs/1606.09375)): + +```py +>>> from spektral.datasets import mnist +>>> X_tr, y_tr, X_va, y_va, X_te, y_te, A = mnist.load_data() +>>> A.shape +(784, 784) +>>> X_tr.shape +(50000, 784, 1) +``` diff --git a/docs/templates/data.md b/docs/templates/data.md index fbd9a69b..80a0d4f0 100644 --- a/docs/templates/data.md +++ b/docs/templates/data.md @@ -1,142 +1,13 @@ -# Representing graphs +# Data -In Spektral, graphs are represented as matrices: +{{autogenerated}} -- `A` is the adjacency matrix of shape `(N, N)`, where `N` is the number of nodes. `A` is a binary matrix where `A[i, j] = 1` if there is an edge between nodes `i` and `j`, and `0` otherwise. -- `X` is the node attributes matrix of shape `(N, F)`, where `F` is the size of the node attributes. +--- -Sometimes, we can also have edge attributes of size `S`, which we store in a matrix `E` of shape `(n_edges, S)` where each row is associated to a non-zero entry of `A`: assuming that `A` is a Scipy sparse matrix, we have that `E[i]` is the attribute associated to `A.data[i]`. +{{autogenerated}} -## Modes +--- -Spektral supports four different ways of representing graphs or batches of graphs, which we refer to as **data modes**. +## Data utils -- In **single mode**, we have one graph with its adjacency matrix and attributes; -- **Disjoint mode** is a special case of single mode, where the graph is the disjoint union of a set of graphs; -- In **batch mode**, a set of graphs is represented by stacking their adjacency and node attributes matrices in higher order tensors of shape `(batch, N, ...)`; -- In **mixed mode**, we have a single adjacency matrix shared by a set of graphs; the adjacency matrix will be in single mode, but the node attributes will be in batch mode. 
- -The difference between the four data modes can be easily seen in how `A`, `X`, and `E` have different shapes in each case: - -|Mode | `A.shape` | `X.shape` | `E.shape` | -|:------:|:-------------:|:-------------:|:----------------:| -|Single |`(N, N)` |`(N, F)` |`(n_edges, S)` | -|Disjoint|`(N, N)` |`(N, F)` |`(n_edges, S)` | -|Batch |`(batch, N, N)`|`(batch, N, F)`|`(batch, N, N, S)`| -|Mixed |`(N, N)` |`(batch, N, F)`| N/A | - - - -## Single mode - - - -In **single mode** the data describes a single graph where: - -- `A` is a sparse matrix of shape `(N, N)`; -- `X` is a matrix of shape `(N, F)`; - -When edge attributes are present, we represent them as a matrix `E` of shape `(n_edges, S)` so that there is a correspondence between `E[i]` and `A.data[i]`. - -Three very popular datasets in this setting are the citation networks: Cora, Citeseer, and Pubmed. To load a citation network, you can use the built-in loader: - -```py ->>> from spektral.datasets import citation ->>> A, X, _, _, _, _ = citation.load_data('cora') -Loading cora dataset ->>> A.shape -(2708, 2708) ->>> X.shape -(2708, 1433) -``` - -## Disjoint mode - - - -**Disjoint mode** is a smart way of representing a set of graphs as a single graph. -In particular, the disjoint union of a batch is a graph where - -- `A` is a sparse block diagonal matrix, where each block is the adjacency matrix `A_i` of the i-th graph; -- `X` is obtained by stacking the node attributes matrices of the graphs. - -When edge attributes are present, we represent them as a matrix `E` of shape `(n_edges, S)` so that there is a correspondence between `E[i]` and `A.data[i]`. - -In order to keep track of different graphs in the disjoint union, we use an additional array of integers `I` that identifies which nodes belong to the same graph. -For convolutional layers, disjoint mode is indistinguishable from single mode because it is not possible to exchange messages between the components of the graph, so `I` is not needed to compute the output. -Pooling layers, on the other hand, require `I` to know which nodes can be pooled together. -Hierarchical pooling layers will return a reduced version of `I` along with the reduced graphs. Global pooling layers will consume `I` and reduce the graphs to single vectors. - -Utilities for creating the disjoint union of a list of graphs are provided in `spektral.utils.data`: - -```py ->>> from spektral.utils.data import to_disjoint ->>> A_list = [np.ones((2, 2)), np.ones((3, 3))] # One graph has 2 nodes, the other has 3 ->>> X_list = [np.random.randn(2, 4), np.random.randn(3, 4)] # F = 4 ->>> X, A, I = to_disjoint(X_list, A_list) ->>> X.shape -(5, 4) ->>> A.shape -(5, 5) ->>> A.toarray() -array([[1., 1., 0., 0., 0.], - [1., 1., 0., 0., 0.], - [0., 0., 1., 1., 1.], - [0., 0., 1., 1., 1.], - [0., 0., 1., 1., 1.]]) ->>> I -array([0, 0, 1, 1, 1]) -``` - -## Batch mode - - - -In **batch mode**, graphs have the same number of nodes and are stacked in tensors of shape `(batch, N, ...)`. -Due to the general lack of support for sparse higher-order tensors both in Scipy and TensorFlow, `A` and `X` will be dense tensors. - -In this case, edge attributes must also be reshaped and made dense, so that `E` has shape `(batch, N, N, S)` (the attributes of non-existing edges are usually all zeros). - -Note that if the graphs have variable number of nodes, the matrices must be zero-padded so that they have the same `N`. 
-If you don't want to zero-pad the graphs or work with dense inputs, it is better to work in [disjoint mode](https://danielegrattarola.github.io/spektral/data/#disjoint-mode) instead. - -The advantage of batch mode is that it is more intuitive and it allows to use the training loop of `tf.keras` without any modifications. Also, some pooling layers like `DiffPool` and `MinCutPool` will only work in batch mode. - -For example, the QM9 dataset of small molecules will be loaded in batch mode by default: - -```py ->>> from spektral.datasets import qm9 ->>> A, X, E, y = qm9.load_data() -Loading QM9 dataset. -Reading SDF ->>> A.shape -(133885, 9, 9) ->>> X.shape -(133885, 9, 6) ->>> E.shape -(133885, 9, 9, 5) -``` - -## Mixed mode - - - -In **mixed mode** we consider a single adjacency matrix that acts as the support for different node attributes (also sometimes called "signals"). - -In this case we have that: - -- `A` is a sparse matrix of shape `(N, N)`; -- `X` is a tensor in batch mode, of shape `(batch, N, F)`; - -Currently, there are no layers in Spektral that support mixed mode and edge attributes. - -An example of a mixed mode dataset is the MNIST random grid ([Defferrard et al., 2016](https://arxiv.org/abs/1606.09375)): - -```py ->>> from spektral.datasets import mnist ->>> X_tr, y_tr, X_va, y_va, X_te, y_te, A = mnist.load_data() ->>> A.shape -(784, 784) ->>> X_tr.shape -(50000, 784, 1) -``` +{{autogenerated}} \ No newline at end of file diff --git a/docs/templates/datasets.md b/docs/templates/datasets.md index e130ec13..c6615e96 100644 --- a/docs/templates/datasets.md +++ b/docs/templates/datasets.md @@ -1,30 +1,3 @@ # Datasets -## Citation networks - -{{autogenerated}} - -## GraphSAGE datasets - -{{autogenerated}} - - -## TU Dortmund Benchmark Datasets for Graph Kernels - -{{autogenerated}} - -## Open Graph Benchmark (OGB) - -{{autogenerated}} - -## QM9 Small Molecules - -{{autogenerated}} - -## MNIST KNN Grid - -{{autogenerated}} - -## Delaunay Triangulations - {{autogenerated}} diff --git a/docs/templates/utils/data.md b/docs/templates/loaders.md similarity index 64% rename from docs/templates/utils/data.md rename to docs/templates/loaders.md index 9daca2a8..63a865ee 100644 --- a/docs/templates/utils/data.md +++ b/docs/templates/loaders.md @@ -1,3 +1,3 @@ -# Data +# Loaders {{autogenerated}} \ No newline at end of file diff --git a/docs/templates/utils/conversion.md b/docs/templates/transforms.md similarity index 58% rename from docs/templates/utils/conversion.md rename to docs/templates/transforms.md index fea7c454..6e8dbd23 100644 --- a/docs/templates/utils/conversion.md +++ b/docs/templates/transforms.md @@ -1,3 +1,3 @@ -# Conversion +# Transforms {{autogenerated}} \ No newline at end of file diff --git a/setup.py b/setup.py index 662b3f02..2e92ae2b 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='spektral', - version='0.6.2', + version='1.0.0', packages=find_packages(), install_requires=['tensorflow>=2.1.0', 'networkx', diff --git a/spektral/__init__.py b/spektral/__init__.py index 29379d81..745c179a 100644 --- a/spektral/__init__.py +++ b/spektral/__init__.py @@ -2,4 +2,4 @@ from . import datasets from . 
import utils -__version__ = '0.6.2' +__version__ = '1.0.0' diff --git a/spektral/chem.py b/spektral/chem.py deleted file mode 100644 index 39694451..00000000 --- a/spektral/chem.py +++ /dev/null @@ -1,313 +0,0 @@ -import networkx as nx -import numpy as np -try: - from rdkit import Chem as rdc - from rdkit.Chem import Draw - from rdkit import rdBase as rdb - - rdb.DisableLog('rdApp.error') # RDKit logging is disabled by default - Draw.DrawingOptions.dblBondOffset = .1 - BOND_MAP = {0: rdc.rdchem.BondType.ZERO, - 1: rdc.rdchem.BondType.SINGLE, - 2: rdc.rdchem.BondType.DOUBLE, - 3: rdc.rdchem.BondType.TRIPLE, - 4: rdc.rdchem.BondType.AROMATIC} -except ImportError: - rdc = None - rdb = None - -NUM_TO_SYMBOL = {1: 'H', 2: 'He', 3: 'Li', 4: 'Be', 5: 'B', 6: 'C', 7: 'N', - 8: 'O', 9: 'F', 10: 'Ne', 11: 'Na', 12: 'Mg', 13: 'Al', - 14: 'Si', 15: 'P', 16: 'S', 17: 'Cl', 18: 'Ar', 19: 'K', - 20: 'Ca', 21: 'Sc', 22: 'Ti', 23: 'V', 24: 'Cr', 25: 'Mn', - 26: 'Fe', 27: 'Co', 28: 'Ni', 29: 'Cu', 30: 'Zn', 31: 'Ga', - 32: 'Ge', 33: 'As', 34: 'Se', 35: 'Br', 36: 'Kr', 37: 'Rb', - 38: 'Sr', 39: 'Y', 40: 'Zr', 41: 'Nb', 42: 'Mo', 43: 'Tc', - 44: 'Ru', 45: 'Rh', 46: 'Pd', 47: 'Ag', 48: 'Cd', 49: 'In', - 50: 'Sn', 51: 'Sb', 52: 'Te', 53: 'I', 54: 'Xe', 55: 'Cs', - 56: 'Ba', 57: 'La', 58: 'Ce', 59: 'Pr', 60: 'Nd', 61: 'Pm', - 62: 'Sm', 63: 'Eu', 64: 'Gd', 65: 'Tb', 66: 'Dy', 67: 'Ho', - 68: 'Er', 69: 'Tm', 70: 'Yb', 71: 'Lu', 72: 'Hf', 73: 'Ta', - 74: 'W', 75: 'Re', 76: 'Os', 77: 'Ir', 78: 'Pt', 79: 'Au', - 80: 'Hg', 81: 'Tl', 82: 'Pb', 83: 'Bi', 84: 'Po', 85: 'At', - 86: 'Rn', 87: 'Fr', 88: 'Ra', 89: 'Ac', 90: 'Th', 91: 'Pa', - 92: 'U', 93: 'Np', 94: 'Pu', 95: 'Am', 96: 'Cm', 97: 'Bk', - 98: 'Cf', 99: 'Es', 100: 'Fm', 101: 'Md', 102: 'No', 103: 'Lr', - 104: 'Rf', 105: 'Db', 106: 'Sg', 107: 'Bh', 108: 'Hs', - 109: 'Mt', 110: 'Ds', 111: 'Rg', 112: 'Cn', 113: 'Nh', - 114: 'Fl', 115: 'Mc', 116: 'Lv', 117: 'Ts', 118: 'Og'} -SYMBOL_TO_NUM = {v: k for k, v in NUM_TO_SYMBOL.items()} - - -def numpy_to_rdkit(adj, nf, ef, sanitize=False): - """ - Converts a molecule from numpy to RDKit format. - :param adj: binary numpy array of shape (N, N) - :param nf: numpy array of shape (N, F) - :param ef: numpy array of shape (N, N, S) - :param sanitize: whether to sanitize the molecule after conversion - :return: an RDKit molecule - """ - if rdc is None: - raise ImportError('`numpy_to_rdkit` requires RDKit.') - mol = rdc.RWMol() - for nf_ in nf: - atomic_num = int(nf_) - if atomic_num > 0: - mol.AddAtom(rdc.Atom(atomic_num)) - - for i, j in zip(*np.triu_indices(adj.shape[-1])): - if i != j and adj[i, j] == adj[j, i] == 1 and not mol.GetBondBetweenAtoms(int(i), int(j)): - bond_type_1 = BOND_MAP[int(ef[i, j, 0])] - bond_type_2 = BOND_MAP[int(ef[j, i, 0])] - if bond_type_1 == bond_type_2: - mol.AddBond(int(i), int(j), bond_type_1) - - mol = mol.GetMol() - if sanitize: - rdc.SanitizeMol(mol) - return mol - - -def numpy_to_smiles(adj, nf, ef): - """ - Converts a molecule from numpy to SMILES format. - :param adj: binary numpy array of shape (N, N) - :param nf: numpy array of shape (N, F) - :param ef: numpy array of shape (N, N, S) - :return: the SMILES string of the molecule - """ - if rdc is None: - raise ImportError('`numpy_to_smiles` requires RDkit.') - mol = numpy_to_rdkit(adj, nf, ef) - return rdkit_to_smiles(mol) - - -def rdkit_to_smiles(mol): - """ - Returns the SMILES string representing an RDKit molecule. 
- :param mol: an RDKit molecule - :return: the SMILES string of the molecule - """ - if rdc is None: - raise ImportError('`rdkit_to_smiles` requires RDkit.') - return rdc.MolToSmiles(mol) - - -def sdf_to_nx(sdf, keep_hydrogen=False): - """ - Converts molecules in SDF format to networkx Graphs. - :param sdf: a list of molecules (or individual molecule) in SDF format. - :param keep_hydrogen: whether to include hydrogen in the representation. - :return: list of nx.Graphs. - """ - if not isinstance(sdf, list): - sdf = [sdf] - - output = [] - for sdf_ in sdf: - g = nx.Graph() - - for atom in sdf_['atoms']: - if atom['atomic_num'] > 1 or keep_hydrogen: - g.add_node(atom['index'], **atom) - for bond in sdf_['bonds']: - start_atom_num = sdf_['atoms'][bond['start_atom']]['atomic_num'] - end_atom_num = sdf_['atoms'][bond['end_atom']]['atomic_num'] - if (start_atom_num > 1 and end_atom_num > 1) or keep_hydrogen: - g.add_edge(bond['start_atom'], bond['end_atom'], **bond) - output.append(g) - - if len(output) == 1: - return output[0] - else: - return output - - -def nx_to_sdf(graphs): - """ - Converts a list of nx.Graphs to the internal SDF format. - :param graphs: list of nx.Graphs. - :return: list of molecules in the internal SDF format. - """ - if isinstance(graphs, nx.Graph): - graphs = [graphs] - output = [] - for g in graphs: - sdf = {'atoms': [v for k, v in g.nodes.items()], - 'bonds': [v for k, v in g.edges.items()], - 'comment': '', - 'data': [''], - 'details': '', - 'n_atoms': -1, - 'n_bonds': -1, - 'name': '', - 'properties': []} - output.append(sdf) - return output - - -def validate_rdkit_mol(mol): - """ - Sanitizes an RDKit molecules and returns True if the molecule is chemically - valid. - :param mol: an RDKit molecule - :return: True if the molecule is chemically valid, False otherwise - """ - if rdc is None: - raise ImportError('`validate_rdkit_mol` requires RDkit.') - if len(rdc.GetMolFrags(mol)) > 1: - return False - try: - rdc.SanitizeMol(mol) - return True - except ValueError: - return False - - -def validate_rdkit(mol): - """ - Validates RDKit molecules (single or in a list). - :param mol: an RDKit molecule or list/np.array thereof - :return: boolean array, True if the molecules are chemically valid, False - otherwise - """ - if rdc is None: - raise ImportError('`validate_rdkit` requires RDkit.') - if isinstance(mol, list) or isinstance(mol, np.ndarray): - return np.array([validate_rdkit_mol(m) for m in mol]) - else: - return validate_rdkit_mol(mol) - - -def get_atomic_symbol(number): - """ - Given an atomic number (e.g., 6), returns its atomic symbol (e.g., 'C') - :param number: int <= 118 - :return: string, atomic symbol - """ - return NUM_TO_SYMBOL[number] - - -def get_atomic_num(symbol): - """ - Given an atomic symbol (e.g., 'C'), returns its atomic number (e.g., 6) - :param symbol: string, atomic symbol - :return: int <= 118 - """ - return SYMBOL_TO_NUM[symbol.lower().capitalize()] - - -def valid_score(molecules, from_numpy=False): - """ - For a given list of molecules (RDKit or numpy format), returns a boolean - array representing the validity of each molecule. 
- :param molecules: list of molecules (RDKit or numpy format) - :param from_numpy: whether the molecules are in numpy format - :return: boolean array with the validity for each molecule - """ - if rdc is None: - raise ImportError('`valid_score` requires RDkit.') - valid = [] - if from_numpy: - molecules = [numpy_to_rdkit(adj_p, nf_p, ef_p) - for adj_p, nf_p, ef_p in molecules] - for mol_rdk in molecules: - valid.append(validate_rdkit_mol(mol_rdk)) - - return np.array(valid) - - -def novel_score(molecules, smiles, from_numpy=False): - """ - For a given list of molecules (RDKit or numpy format), returns a boolean - array representing valid and novel molecules with respect to the list - of smiles provided (a molecule is novel if its SMILES is not in the list). - :param molecules: list of molecules (RDKit or numpy format) - :param smiles: list or set of smiles strings against which to check for - novelty - :param from_numpy: whether the molecules are in numpy format - :return: boolean array with the novelty for each valid molecule - """ - if rdc is None: - raise ImportError('`novel_score` requires RDkit.') - if from_numpy: - molecules = [numpy_to_rdkit(adj_p, nf_p, ef_p) - for adj_p, nf_p, ef_p in molecules] - smiles = set(smiles) - novel = [] - for mol in molecules: - is_valid = validate_rdkit_mol(mol) - is_novel = rdkit_to_smiles(mol) not in smiles - novel.append(is_valid and is_novel) - - return np.array(novel) - - -def unique_score(molecules, from_numpy=False): - """ - For a given list of molecules (RDKit or numpy format), returns the fraction - of unique and valid molecules w.r.t. to the number of valid molecules. - :param molecules: list of molecules (RDKit or numpy format) - :param from_numpy: whether the molecules are in numpy format - :return: fraction of unique valid molecules w.r.t. to valid molecules - """ - if rdc is None: - raise ImportError('`unique_score` requires RDkit.') - if from_numpy: - molecules = [numpy_to_rdkit(adj_p, nf_p, ef_p) - for adj_p, nf_p, ef_p in molecules] - smiles = set() - n_valid = 0 - for mol in molecules: - if validate_rdkit_mol(mol): - n_valid += 1 - smiles.add(rdkit_to_smiles(mol)) - - return 0 if n_valid == 0 else (len(smiles) / n_valid) - - -def enable_rdkit_log(): - """ - Enables RDkit logging. - :return: - """ - if rdb is None: - raise ImportError('`enable_rdkit_log` requires RDkit.') - rdb.EnableLog('rdApp.error') - - -def plot_rdkit(mol, filename=None): - """ - Plots an RDKit molecule in Matplotlib - :param mol: an RDKit molecule - :param filename: save the image with the given filename - :return: the image as np.array - """ - if rdc is None: - raise ImportError('`draw_rdkit_mol` requires RDkit.') - if filename is not None: - Draw.MolToFile(mol, filename) - img = Draw.MolToImage(mol) - return img - - -def plot_rdkit_svg_grid(mols, mols_per_row=5, filename=None, **kwargs): - """ - Plots a grid of RDKit molecules in SVG. 
- :param mols: a list of RDKit molecules - :param mols_per_row: size of the grid - :param filename: save an image with the given filename - :param kwargs: additional arguments for `RDKit.Chem.Draw.MolsToGridImage` - :return: the SVG as a string - """ - if rdc is None: - raise ImportError('`draw_rdkit_mol` requires RDkit.') - svg = Draw.MolsToGridImage(mols, molsPerRow=mols_per_row, useSVG=True, **kwargs) - if filename is not None: - if not filename.endswith('.svg'): - filename += '.svg' - with open(filename, 'w') as f: - f.write(svg) - return svg - diff --git a/spektral/data/__init__.py b/spektral/data/__init__.py index c11741c3..22a975a6 100644 --- a/spektral/data/__init__.py +++ b/spektral/data/__init__.py @@ -1,3 +1,4 @@ from .graph import Graph from .dataset import Dataset -from .loaders import Loader, BatchLoader, DisjointLoader, PackedBatchLoader +from .loaders import Loader, SingleLoader, DisjointLoader, BatchLoader, \ + PackedBatchLoader diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 89bcb1fa..313a681d 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -70,7 +70,7 @@ def read(self): The class also offers a `download()` method that is automatically called if the path returned by the `Dataset.path` attribute does not exists. - This defaults to `~/.spektral/datasets/ClassName/'. + This defaults to `~/.spektral/datasets/ClassName/`. You can implement this however you like, knowing that `download()` will be called before `read()`. You can also override the `path` attribute to @@ -220,6 +220,13 @@ def signature(self): passed to `spektral.data.utils.to_tf_signature(signature)` to compute the TensorFlow signature. You can safely ignore this property unless you are creating a custom `Loader`. + + A signature consist of the TensorFlow TypeSpec, shape, and dtype of + all characteristic matrices of the graphs in the Dataset. This is + returned as a dictionary of dictionaries, with keys `x`, `a`, `e`, and + `y` for the four main data matrices. + + Each sub-dictionary will have keys `spec`, `shape` and `dtype`. """ signature = {} graph = self.graphs[0] # This is always non-empty diff --git a/spektral/data/graph.py b/spektral/data/graph.py index cdf3fcff..10c18af0 100644 --- a/spektral/data/graph.py +++ b/spektral/data/graph.py @@ -36,6 +36,7 @@ class Graph: matrix should have shape `(N, N)`. A Graph should always have either the node features or the adjacency matrix. + Empty graphs are not supported. Edge attributes can be stored in a dense format as arrays of shape `(N, N, S)` or in a sparse format as arrays of shape `(n_edges, S)` @@ -78,6 +79,9 @@ def __setitem__(self, key, value): def __getitem__(self, key): return getattr(self, key, None) + def __contains__(self, key): + return key in self.keys + def __repr__(self): return 'Graph(N={}, F={}, S={}, y={})'\ .format(self.N, self.F, self.S, self.y) @@ -113,3 +117,9 @@ def n_labels(self): else: return None + @property + def keys(self): + keys = [key for key in self.__dict__.keys() + if self[key] is not None + and not key.startswith('__')] + return keys diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index 2a07b6c8..6dabc9bc 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -1,5 +1,3 @@ -import copy - import numpy as np import tensorflow as tf from scipy import sparse as sp @@ -14,12 +12,72 @@ class Loader: """ + Parent class for data loaders. The role of a Loader is to iterate over a + Dataset and yield batches of graphs to give as input to your Keras Models. 
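    In practice, the intended usage looks roughly like this (a minimal sketch,
    assuming a generic graph `dataset`, a compiled Keras `model`, and the
    `DisjointLoader` defined below; calling `loader.tf()` inside `fit()` requires
    TensorFlow 2.4 or above):

    ```python
    from spektral.data import DisjointLoader

    loader = DisjointLoader(dataset, batch_size=32)  # iterates indefinitely by default
    model.fit(loader.tf(), steps_per_epoch=loader.steps_per_epoch, epochs=10)
    ```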
+ This is achieved by having a generator object that produces lists of Graphs, + which are then collated together and returned as Tensors (or objects that + can be converted to Tensors, like Numpy arrays). + + The core of a Loader is the `collate(batch)` method. + This takes as input a list of Graphs and returns a list of Tensors or + SparseTensors. + + For instance, if all graphs have the same number of nodes and size of the + attributes, a simple collation function can be: + + ```python + def collate(self, batch): + x = np.array([g.x for g in batch]) + a = np.array([g.a for g in batch)] + return x, a + ``` + + Since all data matrices (node attributes, adjacency matrices, etc.) + are usually collated together, the two list comprehensions of the example + above can be computed all at once by using the private `_pack()` method + of the Loader class: + + ```python + def collate(self, batch): + x, a = self._pack(batch) + return np.array(x), np.array(a) + ``` + + Additionally, a Loader should implement two main methods that simplify its + usage within the TensorFlow/Keras training pipeline: + + - `tf()`: should return a `tf.data` dataset, a generator, or a + `keras.utils.Sequence`. Its usage pattern should be as follows: + + `model.fit(loader.tf(), steps_per_epoch=loader.steps_per_epoch)` + + The `steps_per_epoch` property returns the number of batches + (as specified by the `batch_size` argument) that are in an epoch and is + automatically computed from the data. + + Note that TensorFlow 2.4 or above is required to use this method in a + Keras training loop. + + By default, `tf()` will simply return a `tf.data.Dataset.from_generator` + dataset obtained from the Loader itself (since Loaders are also Python + generators). + + - `tf_signature()`: this method should return the Tensorflow signature of + the batches computed by `collate(batch)`, using the `tf.TypeSpec` system. + All Datasets have a `signature` property that can be used to compute the + TensorFlow signature (which represents the shape, dtype and TypeSpec of a + each data matrix in a generic graph) with the + `spektral.data.utils.to_tf_signature(signature)` function. + + By default, `tf_signature()` will simply return the Dataset's signature + converted to the TensorFlow format. + **Arguments** - - `dataset`: a Dataset object to load. - - `batch_size`: size of the mini-batches. - - `epochs`: number of epochs to iterate over the datset. By default (`None`) - iterates indefinitely. + - `dataset`: a graph Dataset; + - `batch_size`: size of the mini-batches; + - `epochs`: number of epochs to iterate over the dataset. By default (`None`) + iterates indefinitely; - `shuffle`: whether to shuffle the data at the start of each epoch. """ def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): @@ -44,10 +102,15 @@ def collate(self, batch): raise NotImplementedError def tf(self): - raise NotImplementedError + if not tf_loader_available: + raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' + 'or greater.') + return tf.data.Dataset.from_generator( + lambda: self, output_signature=self.tf_signature()) def tf_signature(self): - raise NotImplementedError + signature = self.dataset.signature + return to_tf_signature(signature) def _pack(self, batch): return [list(elem) for elem in zip(*[g.numpy() for g in batch])] @@ -59,9 +122,48 @@ def steps_per_epoch(self): class SingleLoader(Loader): """ - A [Loader]() for single mode. + A Loader for single mode. 
+ + This loader produces Tensors representing one graph, with its attributes, + edges and labels (usually for node-level prediction). As such, it can only + be used with Datasets of length 1 and the `batch_size` cannot be set. + + The loader also supports sample weights through the `sample_weights` + argument. If given, then each batch will be a tuple + `(inputs, labels, sample_weights)`. + + The `tf()` method of this loader can be used even with TensorFlow versions + below 2.4. + + **Arguments** + + - `dataset`: a graph Dataset; + - `epochs`: number of epochs to iterate over the dataset. By default (`None`) + iterates indefinitely; + - `shuffle`: whether to shuffle the data at the start of each epoch; + - `sample_weights`: if given, these will be appended to the output + automatically. + + **Output** + + Returns a tuple `(inputs, labels)` or `(inputs, labels, sample_weights)`. + + `inputs` are a tuple containing the non-None data matrices of the graph: + + - `x`: same as `dataset[0].x`; + - `a`: same as `dataset[0].a` (scipy sparse matrices are converted to + SparseTensors); + - `e`: same as `dataset[0].e`; + + `labels` is the same as `datsaset[0].y`. + If available, `sample_weights` is the same object passed to the constructor. + + """ def __init__(self, dataset, epochs=None, sample_weights=None): + if len(dataset) != 1: + raise ValueError('SingleLoader can only be used with Datasets that' + 'have a single graph.') self.sample_weights = sample_weights super().__init__(dataset, batch_size=1, epochs=epochs, shuffle=False) @@ -69,6 +171,12 @@ def collate(self, batch): graph = batch[0] output = graph.numpy() output = [output[:-1], output[-1]] + + # Sparse matrices to SparseTensors + for i in range(len(output)): + if sp.issparse(output[i]): + output[i] = sp_matrix_to_sp_tensor(output[i]) + if self.sample_weights is not None: output += [self.sample_weights] return tuple(output) @@ -77,37 +185,82 @@ def tf(self): output = self.collate(self.dataset) return tf.data.Dataset.from_tensors(output).repeat(self.epochs) - def tf_signature(self): - pass - class DisjointLoader(Loader): """ - A [Loader](https://graphneural.network/) for disjoint mode. + A Loader for disjoint mode. + + This loader produces batches of graphs as their disjoint union, and supports + labels both for graph-level and node-level learning. + + Because in disjoint mode we need a way to keep track of which nodes belong + to which graph, the loader will also automatically compute a batch index + tensor, containing integer indices that map each node to its corresponding + graph in the batch. + + The adjacency matrix will always be returned as a SparseTensor, regardless + of the input. + Edge attributes will be returned as a sparse edge list of shape + `(n_edges, S)`. + + If `node_level=False`, the labels are interpreted as graph-level labels and + are stacked along an additional dimension (i.e., `(n_graphs, n_labels)`) + If `node_level=True`, then the labels are stacked vertically (i.e., + `(n_nodes, n_labels)`). + + **Arguments** + + - `dataset`: a graph Dataset; + - `node_level`: boolean (default `False`), whether to interpret labels as + node-level instead of graph-level; + - `epochs`: number of epochs to iterate over the dataset. By default (`None`) + iterates indefinitely; + - `shuffle`: whether to shuffle the data at the start of each epoch. + + **Output** + + For each batch, returns a tuple `(inputs, labels)`. 
+ + `inputs` is a tuple containing: + + - `x`: node attributes stacked along the outermost dimension; + - `a`: SparseTensor, the block-diagonal matrix obtained from the adjacency + matrices of the batch; + - `e`: edge attributes as edge list of shape `(n_edges, S)`; + + If `node_level=False`, `labels` has shape `(n_graphs, n_labels)`; + If `node_level=True`, then the labels are stacked vertically, i.e., + `(n_nodes, n_labels)`. + """ + def __init__(self, dataset, node_level=False, batch_size=1, epochs=None, + shuffle=True): + self.node_level = node_level + super(DisjointLoader, self).__init__(dataset, batch_size=batch_size, + epochs=epochs, shuffle=shuffle) + def collate(self, batch): packed = self._pack(batch) - y = np.array(packed[-1]) - ret = to_disjoint(*packed[:-1]) - ret = list(ret) - for i in range(len(ret)): - if sp.issparse(ret[i]): - ret[i] = sp_matrix_to_sp_tensor(ret[i]) - ret = tuple(ret) + if self.node_level: + y = np.vstack(packed[-1]) + else: + y = np.array(packed[-1]) + output = to_disjoint(*packed[:-1]) - return ret, y + # Sparse matrices to SparseTensors + output = list(output) + for i in range(len(output)): + if sp.issparse(output[i]): + output[i] = sp_matrix_to_sp_tensor(output[i]) + output = tuple(output) - def tf(self): - if not tf_loader_available: - raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' - 'or greater.') - return tf.data.Dataset.from_generator( - lambda: (_ for _ in self), output_signature=self.tf_signature()) + return output, y def tf_signature(self): signature = self.dataset.signature if 'y' in signature: - signature['y']['shape'] = prepend_none(signature['y']['shape']) + if not self.node_level: + signature['y']['shape'] = prepend_none(signature['y']['shape']) if 'a' in signature: signature['a']['spec'] = tf.SparseTensorSpec @@ -121,21 +274,59 @@ def tf_signature(self): class BatchLoader(Loader): """ - A [Loader](https://graphneural.network/) for batch mode. + A Loader for batch mode. + + This loader returns batches of graphs stacked along an extra dimension, + with all "node" dimensions padded to be equal among all graphs. + + If `n_max` is the number of nodes of the biggest graph in the batch, then + the padding consist of adding zeros to the node features, adjacency matrix, + and edge attributes of each graph so that they have shapes + `(n_max, F)`, `(n_max, n_max)`, and `(n_max, n_max, S)` respectively. + + The zero-padding is done batch-wise, which saves up memory at the cost of + more computation. If latency is an issue but memory isn't, or if the + dataset has graphs with a similar number of nodes, you can use + the `PackedBatchLoader` that first zero-pads all the dataset and then + iterates over it. + + Note that the adjacency matrix and edge attributes are returned as dense + arrays (mostly due to the lack of support for sparse tensor operations for + rank >2). + + Only graph-level labels are supported with this loader (i.e., labels are not + zero-padded because they are assumed to have no "node" dimensions). + + **Arguments** + + - `dataset`: a graph Dataset; + - `batch_size`: size of the mini-batches; + - `epochs`: number of epochs to iterate over the dataset. By default (`None`) + iterates indefinitely; + - `shuffle`: whether to shuffle the data at the start of each epoch. + + **Output** + + For each batch, returns a tuple `(inputs, labels)`. 
+ + `inputs` is a tuple containing: + + - `x`: node attributes, zero-padded and stacked along an extra dimension + (shape `(n_graphs, n_max, F)`); + - `a`: adjacency matrices (dense), zero-padded and stacked along an extra + dimension (shape `(n_graphs, n_max, n_max)`); + - `e`: edge attributes (dense), zero-padded and stacked along an extra + dimension (shape `(n_graphs, n_max, n_max, S)`). + + `labels` are also stacked along an extra dimension. + """ def collate(self, batch): packed = self._pack(batch) y = np.array(packed[-1]) - ret = to_batch(*packed[:-1]) + output = to_batch(*packed[:-1]) - return ret, y - - def tf(self): - if not tf_loader_available: - raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' - 'or greater.') - return tf.data.Dataset.from_generator( - lambda: (_ for _ in self), output_signature=self.tf_signature()) + return output, y def tf_signature(self): signature = self.dataset.signature @@ -153,18 +344,26 @@ def tf_signature(self): class PackedBatchLoader(BatchLoader): """ - A [Loader](https://graphneural.network/) for batch mode, that pre-pads all - graphs to have the same number of nodes. + A `BatchLoader` that pre-pads the graphs before iterating over the dataset + to create the batches. + While using more memory than `BatchLoader`, this loader should reduce the - overhead due to padding each batch independently. - Use this loader if you have graphs of similar sizes and no outliers (i.e., - anomalous graphs with many more nodes than average). + computational overhead of padding each batch independently. + + Use this loader if: + + - memory usage isn't an issue and you want to compute the batches as fast + as possible; + - the graphs in the dataset have similar sizes and there are no outliers in + the dataset (i.e., anomalous graphs with many more nodes than the dataset + average). """ def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): super().__init__(dataset, batch_size=batch_size, epochs=epochs, shuffle=shuffle) # Drop the Dataset container and work on packed tensors directly self.dataset = self._pack(self.dataset) self.dataset = to_batch(*self.dataset[:-1]) + (np.array(self.dataset[-1]), ) + # Re-instantiate generator after packing dataset self._generator = self.generator() diff --git a/spektral/data/utils.py b/spektral/data/utils.py index 3659a426..ad7b0edb 100644 --- a/spektral/data/utils.py +++ b/spektral/data/utils.py @@ -2,7 +2,6 @@ import tensorflow as tf from scipy import sparse as sp -from spektral.layers.ops import sp_batch_to_sp_tensor from spektral.utils import pad_jagged_array @@ -174,6 +173,11 @@ def prepend_none(t): def to_tf_signature(signature): + """ + Converts a Dataset signature to a TensorFlow signature. + :param signature: a Dataset signature. + :return: a TensorFlow signature. + """ output = [] keys = ['x', 'a', 'e', 'i'] for k in keys: diff --git a/spektral/datasets/__init__.py b/spektral/datasets/__init__.py index bc28a7f0..a91f2f61 100644 --- a/spektral/datasets/__init__.py +++ b/spektral/datasets/__init__.py @@ -1,5 +1,5 @@ from .citation import Citation -from . import graphsage +from .graphsage import GraphSage, PPI, Reddit from . 
import mnist from .ogb import OGB from .qm9 import QM9 diff --git a/spektral/transforms/__init__.py b/spektral/transforms/__init__.py index 104ba434..549af269 100644 --- a/spektral/transforms/__init__.py +++ b/spektral/transforms/__init__.py @@ -1,6 +1,9 @@ from .adj_to_sp_tensor import AdjToSpTensor +from .constant import Constant from .degree import Degree, MaxDegree from .gcn_filter import GCNFilter from .layer_preprocess import LayerPreprocess from .normalize_adj import NormalizeAdj +from .normalize_one import NormalizeOne +from .normalize_sphere import NormalizeSphere from .one_hot import OneHotLabels diff --git a/spektral/transforms/adj_to_sp_tensor.py b/spektral/transforms/adj_to_sp_tensor.py index 8449cb66..d87ff75f 100644 --- a/spektral/transforms/adj_to_sp_tensor.py +++ b/spektral/transforms/adj_to_sp_tensor.py @@ -2,6 +2,9 @@ class AdjToSpTensor(object): + """ + Converts the adjacency matrix to a SparseTensor. + """ def __call__(self, graph): if graph.a is not None: graph.a = sp_matrix_to_sp_tensor(graph.a) diff --git a/spektral/transforms/constant.py b/spektral/transforms/constant.py index 15264184..26534013 100644 --- a/spektral/transforms/constant.py +++ b/spektral/transforms/constant.py @@ -2,6 +2,16 @@ class Constant(object): + """ + Concatenates a constant value to the node attributes. + + If the graph doesn't have node attributes, then they are created and set to + `value`. + + **Arguments** + + - `value`: the value to concatenate to the node attributes. + """ def __init__(self, value): self.value = value diff --git a/spektral/transforms/degree.py b/spektral/transforms/degree.py index cb54958f..60f7132b 100644 --- a/spektral/transforms/degree.py +++ b/spektral/transforms/degree.py @@ -4,13 +4,30 @@ class Degree(object): + """ + Concatenates to each node attribute the one-hot degree of the corresponding + node. + + If the graph doesn't have node attributes, then they are created and set to + the degree. + + The adjacency matrix is expected to have integer entries and the degree is + cast to integer before one-hot encoding. + + **Arguments** + + - `max_degree`: the maximum degree of the nodes, i.e., the size of the + one-hot vectors. + """ def __init__(self, max_degree): self.max_degree = max_degree def __call__(self, graph): - degree = graph.a.sum(1) + if 'a' not in graph: + raise ValueError('The graph must have an adjacency matrix') + degree = graph.a.sum(1).astype(int) degree = one_hot(degree, self.max_degree + 1) - if graph.x is None: + if 'x' not in graph: graph.x = degree else: graph.x = np.concatenate((graph.x, degree), axis=-1) diff --git a/spektral/transforms/gcn_filter.py b/spektral/transforms/gcn_filter.py index fff65702..9985e108 100644 --- a/spektral/transforms/gcn_filter.py +++ b/spektral/transforms/gcn_filter.py @@ -2,6 +2,20 @@ class GCNFilter(object): + r""" + Normalizes the adjacency matrix as described by + [Kipf & Welling (2017)](https://arxiv.org/abs/1609.02907): + + $$ + \A \leftarrow \hat\D^{-\frac{1}{2}} (\A + \I) \hat\D^{-\frac{1}{2}} + $$ + + where \( \hat\D_{ii} = 1 + \sum\limits_{j = 1}^{N} \A_{ij} \). + + **Arguments** + + - `symmetric`: If False, then it computes \(\hat\D^{-1} (\A + \I)\) instead. 
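    A minimal usage sketch (here `graph` is a generic `spektral.data.Graph`, and
    `MyDataset` is a hypothetical `Dataset` subclass that accepts the `transforms`
    argument, as in the `custom_dataset.py` example):

    ```python
    from spektral.transforms import GCNFilter

    # Transforms are callables that modify a Graph in place:
    GCNFilter()(graph)

    # They can also be applied to every graph of a Dataset at load time:
    dataset = MyDataset(transforms=GCNFilter())  # hypothetical Dataset subclass
    ```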
+ """ def __init__(self, symmetric=True): self.symmetric = symmetric diff --git a/spektral/transforms/normalize_adj.py b/spektral/transforms/normalize_adj.py index f83528ba..da105cdf 100644 --- a/spektral/transforms/normalize_adj.py +++ b/spektral/transforms/normalize_adj.py @@ -2,6 +2,17 @@ class NormalizeAdj(object): + r""" + Normalizes the adjacency matrix as: + + $$ + \A \leftarrow \D^{-1/2}\A\D^{-1/2} + $$ + + **Arguments** + + - `symmetric`: If False, then it computes \(\D^{-1}\A\) instead. + """ def __init__(self, symmetric=True): self.symmetric = symmetric diff --git a/spektral/transforms/normalize_one.py b/spektral/transforms/normalize_one.py index 87cd9c7e..5b106ee8 100644 --- a/spektral/transforms/normalize_one.py +++ b/spektral/transforms/normalize_one.py @@ -2,6 +2,15 @@ class NormalizeOne: + r""" + Normalizes the node attributes by dividing each row by its sum, so that it + sums to 1: + + $$ + \X_i \leftarrow \frac{\X_i}{\sum_{j=1}^{N} \X_{ij}} + $$ + + """ def __call__(self, graph): x_sum = np.sum(graph.x, -1) x_sum[x_sum == 0] = 1 diff --git a/spektral/transforms/normalize_sphere.py b/spektral/transforms/normalize_sphere.py index efa0bb65..646b34a2 100644 --- a/spektral/transforms/normalize_sphere.py +++ b/spektral/transforms/normalize_sphere.py @@ -2,9 +2,19 @@ class NormalizeSphere: + r""" + Normalizes the node attributes so that they are centered at the origin and + contained within a sphere of radius 1: + + $$ + \X_{i} \leftarrow \frac{\X_{i} - \bar\X}{\max_{i,j} \X_{ij}} + $$ + + where \( \bar\X \) is the centroid of the node features. + """ def __call__(self, graph): offset = np.mean(graph.x, -2, keepdims=True) - scale = 1 / np.abs(graph.x).max() - graph.x = (graph.x - offset) * scale + scale = np.abs(graph.x).max() + graph.x = (graph.x - offset) / scale return graph diff --git a/spektral/transforms/one_hot.py b/spektral/transforms/one_hot.py index 0ad172ef..063134f5 100644 --- a/spektral/transforms/one_hot.py +++ b/spektral/transforms/one_hot.py @@ -2,6 +2,19 @@ class OneHotLabels(object): + """ + One-hot encodes the graph labels along the innermost dimension (also if they + are simple scalars). + + Either `depth` or `labels` must be passed as argument. + + **Arguments** + + - `depth`: int, the size of the one-hot vector (labels are intended as + indices for a vector of size `depth`); + - `labels`: list or tuple, the possible values that the labels can take + (labels are one-hot encoded according to where they are found in`labels`). + """ def __init__(self, depth=None, labels=None): self.depth = depth self.labels = labels diff --git a/spektral/utils/__init__.py b/spektral/utils/__init__.py index 13dbcb33..ede707ea 100644 --- a/spektral/utils/__init__.py +++ b/spektral/utils/__init__.py @@ -1,4 +1,3 @@ -from .conversion import * from .convolution import * from .logging import * from .misc import * diff --git a/spektral/utils/conversion.py b/spektral/utils/conversion.py deleted file mode 100644 index 7b8b232d..00000000 --- a/spektral/utils/conversion.py +++ /dev/null @@ -1,203 +0,0 @@ -import networkx as nx -import numpy as np - -from spektral.utils.misc import pad_jagged_array, add_eye_jagged, add_eye_batch, flatten_list - - -# Available conversions: Numpy <-> Networkx <-> SDF - - -def nx_to_adj(graphs): - """ - Converts a list of nx.Graphs to a rank 3 np.array of adjacency matrices - of shape `(num_graphs, num_nodes, num_nodes)`. - :param graphs: a nx.Graph, or list of nx.Graphs. - :return: a rank 3 np.array of adjacency matrices. 
- """ - if isinstance(graphs, nx.Graph): - graphs = [graphs] - return np.array([nx.attr_sparse_matrix(g)[0].toarray() for g in graphs]) - - -def nx_to_node_features(graphs, keys, post_processing=None): - """ - Converts a list of nx.Graphs to a rank 3 np.array of node features matrices - of shape `(num_graphs, num_nodes, num_features)`. Optionally applies a - post-processing function to each individual attribute in the nx Graphs. - :param graphs: a nx.Graph, or a list of nx.Graphs; - :param keys: a list of keys with which to index node attributes in the nx - Graphs. - :param post_processing: a list of functions with which to post process each - attribute associated to a key. `None` can be passed as post-processing - function to leave the attribute unchanged. - :return: a rank 3 np.array of feature matrices - """ - if post_processing is not None: - if len(post_processing) != len(keys): - raise ValueError('post_processing must contain an element for each key') - for i in range(len(post_processing)): - if post_processing[i] is None: - post_processing[i] = lambda x: x - - if isinstance(graphs, nx.Graph): - graphs = [graphs] - - output = [] - for g in graphs: - node_features = [] - for v in g.nodes.values(): - f = [v[key] for key in keys] - if post_processing is not None: - f = [op(_) for op, _ in zip(post_processing, f)] - f = flatten_list(f) - node_features.append(f) - output.append(np.array(node_features)) - - return np.array(output) - - -def nx_to_edge_features(graphs, keys, post_processing=None): - """ - Converts a list of nx.Graphs to a rank 4 np.array of edge features matrices - of shape `(num_graphs, num_nodes, num_nodes, num_features)`. - Optionally applies a post-processing function to each attribute in the nx - graphs. - :param graphs: a nx.Graph, or a list of nx.Graphs; - :param keys: a list of keys with which to index edge attributes. - :param post_processing: a list of functions with which to post process each - attribute associated to a key. `None` can be passed as post-processing - function to leave the attribute unchanged. - :return: a rank 3 np.array of feature matrices - """ - if post_processing is not None: - if len(post_processing) != len(keys): - raise ValueError('post_processing must contain an element for each key') - for i in range(len(post_processing)): - if post_processing[i] is None: - post_processing[i] = lambda x: x - - if isinstance(graphs, nx.Graph): - graphs = [graphs] - - output = [] - for g in graphs: - edge_features = [] - for key in keys: - ef = nx.attr_sparse_matrix(g, edge_attr=key)[0].toarray() - if ef.ndim == 2: - ef = ef[..., None] # Make it three dimensional to concatenate - edge_features.append(ef) - if post_processing is not None: - edge_features = [op(_) for op, _ in zip(post_processing, edge_features)] - if len(edge_features) > 1: - edge_features = np.concatenate(edge_features, axis=-1) - else: - edge_features = np.array(edge_features[0]) - output.append(edge_features) - - return np.array(output) - - -def nx_to_numpy(graphs, auto_pad=True, self_loops=True, nf_keys=None, - ef_keys=None, nf_postprocessing=None, ef_postprocessing=None): - """ - Converts a list of nx.Graphs to numpy format (adjacency, node attributes, - and edge attributes matrices). - :param graphs: a nx.Graph, or list of nx.Graphs; - :param auto_pad: whether to zero-pad all matrices to have graphs with the - same dimension (set this to true if you don't want to deal with manual - batching for different-size graphs. - :param self_loops: whether to add self-loops to the graphs. 
- :param nf_keys: a list of keys with which to index node attributes. If None, - returns None as node attributes matrix. - :param ef_keys: a list of keys with which to index edge attributes. If None, - returns None as edge attributes matrix. - :param nf_postprocessing: a list of functions with which to post process each - node attribute associated to a key. `None` can be passed as post-processing - function to leave the attribute unchanged. - :param ef_postprocessing: a list of functions with which to post process each - edge attribute associated to a key. `None` can be passed as post-processing - function to leave the attribute unchanged. - :return: - - adjacency matrices of shape `(num_samples, num_nodes, num_nodes)` - - node attributes matrices of shape `(num_samples, num_nodes, node_features_dim)` - - edge attributes matrices of shape `(num_samples, num_nodes, num_nodes, edge_features_dim)` - """ - adj = nx_to_adj(graphs) - if nf_keys is not None: - nf = nx_to_node_features(graphs, nf_keys, post_processing=nf_postprocessing) - else: - nf = None - if ef_keys is not None: - ef = nx_to_edge_features(graphs, ef_keys, post_processing=ef_postprocessing) - else: - ef = None - - if self_loops: - if adj.ndim == 1: # Jagged array - adj = add_eye_jagged(adj) - adj = np.array([np.clip(a_, 0, 1) for a_ in adj]) - else: # Rank 3 tensor - adj = add_eye_batch(adj) - adj = np.clip(adj, 0, 1) - - if auto_pad: - # Pad all arrays to represent k-nodes graphs - k = max([_.shape[-1] for _ in adj]) - adj = pad_jagged_array(adj, (k, k)) - if nf is not None: - nf = pad_jagged_array(nf, (k, -1)) - if ef is not None: - ef = pad_jagged_array(ef, (k, k, -1)) - - return adj, nf, ef - - -def numpy_to_nx(adj, node_features=None, edge_features=None, nf_name=None, - ef_name=None): - """ - Converts graphs in numpy format to a list of nx.Graphs. - :param adj: adjacency matrices of shape `(num_samples, num_nodes, num_nodes)`. - If there is only one sample, the first dimension can be dropped. - :param node_features: optional node attributes matrices of shape `(num_samples, num_nodes, node_features_dim)`. - If there is only one sample, the first dimension can be dropped. - :param edge_features: optional edge attributes matrices of shape `(num_samples, num_nodes, num_nodes, edge_features_dim)` - If there is only one sample, the first dimension can be dropped. - :param nf_name: optional name to assign to node attributes in the nx.Graphs - :param ef_name: optional name to assign to edge attributes in the nx.Graphs - :return: a list of nx.Graphs (or a single nx.Graph is there is only one sample) - """ - if adj.ndim == 2: - adj = adj[None, ...] - if node_features is not None: - if nf_name is None: - nf_name = 'node_features' - node_features = node_features[None, ...] - if node_features.ndim != 3: - raise ValueError('node_features must have shape (batch, N, F) ' - 'or (N, F).') - if edge_features is not None: - if ef_name is None: - ef_name = 'edge_features' - edge_features = edge_features[None, ...] 
- if edge_features.ndim != 4: - raise ValueError('edge_features must have shape (batch, N, N, S) ' - 'or (N, N, S).') - - output = [] - for i in range(adj.shape[0]): - g = nx.from_numpy_array(adj[i]) - g.remove_nodes_from(list(nx.isolates(g))) - - if node_features is not None: - node_attrs = {n: {nf_name: node_features[i, n]} for n in g.nodes} - nx.set_node_attributes(g, node_attrs, nf_name) - if edge_features is not None: - edge_attrs = {e: {ef_name: edge_features[i, e[0], e[1]]} for e in g.edges} - nx.set_edge_attributes(g, edge_attrs, ef_name) - output.append(g) - - if len(output) == 1: - return output[0] - else: - return output diff --git a/spektral/utils/convolution.py b/spektral/utils/convolution.py index 8ae3a848..30a78add 100644 --- a/spektral/utils/convolution.py +++ b/spektral/utils/convolution.py @@ -191,3 +191,26 @@ def chebyshev_filter(A, k, symmetric=True): return T_k +def add_self_loops(a, value=1): + """ + Sets the inner diagonals of `a` to `value`. + :param a: a np.array or scipy.sparse matrix, the innermost two dimensions + must be equal. + :param value: value to set the diagonals to. + :return: a np.array or scipy.sparse matrix with the same shape as `a`. + """ + a = a.copy() + if len(a.shape) < 2: + raise ValueError('a must have at least rank 2') + n = a.shape[-1] + if n != a.shape[-2]: + raise ValueError('Innermost two dimensions must be equal. Got {}' + .format(a.shape)) + if sp.issparse(a): + a = a.tolil() + a.setdiag(value) + return a.tocsr() + else: + idx = np.arange(n) + a[..., idx, idx] = value + return a \ No newline at end of file diff --git a/spektral/utils/io.py b/spektral/utils/io.py index 1beb6af3..fa5bfedb 100644 --- a/spektral/utils/io.py +++ b/spektral/utils/io.py @@ -6,8 +6,6 @@ import numpy as np import pandas as pd -from spektral.chem import get_atomic_num - def load_binary(filename): """ @@ -183,6 +181,36 @@ def dump_txt(obj, filename, **kwargs): # 'name': 'gdb_54964', # 'properties': []} HEADER_SIZE = 3 +NUM_TO_SYMBOL = {1: 'H', 2: 'He', 3: 'Li', 4: 'Be', 5: 'B', 6: 'C', 7: 'N', + 8: 'O', 9: 'F', 10: 'Ne', 11: 'Na', 12: 'Mg', 13: 'Al', + 14: 'Si', 15: 'P', 16: 'S', 17: 'Cl', 18: 'Ar', 19: 'K', + 20: 'Ca', 21: 'Sc', 22: 'Ti', 23: 'V', 24: 'Cr', 25: 'Mn', + 26: 'Fe', 27: 'Co', 28: 'Ni', 29: 'Cu', 30: 'Zn', 31: 'Ga', + 32: 'Ge', 33: 'As', 34: 'Se', 35: 'Br', 36: 'Kr', 37: 'Rb', + 38: 'Sr', 39: 'Y', 40: 'Zr', 41: 'Nb', 42: 'Mo', 43: 'Tc', + 44: 'Ru', 45: 'Rh', 46: 'Pd', 47: 'Ag', 48: 'Cd', 49: 'In', + 50: 'Sn', 51: 'Sb', 52: 'Te', 53: 'I', 54: 'Xe', 55: 'Cs', + 56: 'Ba', 57: 'La', 58: 'Ce', 59: 'Pr', 60: 'Nd', 61: 'Pm', + 62: 'Sm', 63: 'Eu', 64: 'Gd', 65: 'Tb', 66: 'Dy', 67: 'Ho', + 68: 'Er', 69: 'Tm', 70: 'Yb', 71: 'Lu', 72: 'Hf', 73: 'Ta', + 74: 'W', 75: 'Re', 76: 'Os', 77: 'Ir', 78: 'Pt', 79: 'Au', + 80: 'Hg', 81: 'Tl', 82: 'Pb', 83: 'Bi', 84: 'Po', 85: 'At', + 86: 'Rn', 87: 'Fr', 88: 'Ra', 89: 'Ac', 90: 'Th', 91: 'Pa', + 92: 'U', 93: 'Np', 94: 'Pu', 95: 'Am', 96: 'Cm', 97: 'Bk', + 98: 'Cf', 99: 'Es', 100: 'Fm', 101: 'Md', 102: 'No', 103: 'Lr', + 104: 'Rf', 105: 'Db', 106: 'Sg', 107: 'Bh', 108: 'Hs', + 109: 'Mt', 110: 'Ds', 111: 'Rg', 112: 'Cn', 113: 'Nh', + 114: 'Fl', 115: 'Mc', 116: 'Lv', 117: 'Ts', 118: 'Og'} +SYMBOL_TO_NUM = {v: k for k, v in NUM_TO_SYMBOL.items()} + + +def _get_atomic_num(symbol): + """ + Given an atomic symbol (e.g., 'C'), returns its atomic number (e.g., 6) + :param symbol: string, atomic symbol + :return: int <= 118 + """ + return SYMBOL_TO_NUM[symbol.lower().capitalize()] def _parse_header(sdf): @@ -222,7 +250,7 @@ def 
_parse_atoms_block(sdf, n_atoms): atoms = [] for i, v in enumerate(values): coords = np.array([float(v[pos:pos+10]) for pos in range(0, 30, 10)]) - atomic_num = get_atomic_num(v[31:34].strip()) + atomic_num = _get_atomic_num(v[31:34].strip()) iso = int(v[34:36]) charge = int(v[36:39]) info = np.array([int(v[pos:pos+3]) for pos in range(39, len(v), 3)]) diff --git a/spektral/utils/misc.py b/spektral/utils/misc.py index 94ee7864..3cbbf536 100644 --- a/spektral/utils/misc.py +++ b/spektral/utils/misc.py @@ -1,5 +1,4 @@ import numpy as np -from scipy import sparse as sp def pad_jagged_array(x, target_shape): @@ -27,88 +26,6 @@ def pad_jagged_array(x, target_shape): return output -def add_eye(x): - """ - Adds the identity matrix to the given matrix. - :param x: a rank 2 np.array or scipy.sparse matrix - :return: a rank 2 np.array or scipy.sparse matrix - """ - if x.ndim != 2: - raise ValueError('X must be of rank 2 but has rank {}.'.format(x.ndim)) - if sp.issparse(x): - eye = sp.eye(x.shape[0]) - else: - eye = np.eye(x.shape[0]) - return x + eye - - -def sub_eye(x): - """ - Subtracts the identity matrix from the given matrix. - :param x: a rank 2 np.array or scipy.sparse matrix - :return: a rank 2 np.array or scipy.sparse matrix - """ - if x.ndim != 2: - raise ValueError('x must be of rank 2 but has rank {}.'.format(x.ndim)) - if sp.issparse(x): - eye = sp.eye(x.shape[0]) - else: - eye = np.eye(x.shape[0]) - return x - eye - - -def add_eye_batch(x): - """ - Adds the identity matrix to each submatrix of the given rank 3 array. - :param x: a rank 3 np.array - :return: a rank 3 np.array - """ - if x.ndim != 3: - raise ValueError('x must be of rank 3 but has rank {}.'.format(x.ndim)) - return x + np.eye(x.shape[1])[None, ...] - - -def sub_eye_batch(x): - """ - Subtracts the identity matrix from each submatrix of the given rank 3 - array. - :param x: a rank 3 np.array - :return: a rank 3 np.array - """ - if x.ndim != 3: - raise ValueError('x must be of rank 3 but has rank {}.'.format(x.ndim)) - return x - np.repeat(np.eye(x.shape[1])[None, ...], x.shape[0], axis=0) - - -def add_eye_jagged(x): - """ - Adds the identity matrix to each submatrix of the given rank 3 jagged array. - :param x: a rank 3 jagged np.array - :return: a rank 3 jagged np.array - """ - x_out = x.copy() - for i in range(len(x)): - if x[i].ndim != 2: - raise ValueError('Jagged array must only contain 2d slices') - x_out[i] = add_eye(x[i]) - return x_out - - -def sub_eye_jagged(x): - """ - Subtracts the identity matrix from each submatrix of the given rank 3 - jagged array. - :param x: a rank 3 jagged np.array - :return: a rank 3 jagged np.array - """ - x_out = x.copy() - for i in range(len(x)): - if x[i].ndim != 2: - raise ValueError('Jagged array must only contain 2d slices') - x_out[i] = sub_eye(x[i]) - return x_out - - def one_hot(x, depth): """ One-hot encodes the integer array `x` in an array of length `depth`. @@ -143,32 +60,7 @@ def label_to_one_hot(x, labels): return one_hot(out, depth) -def add_self_loops(a, value=1): - """ - Sets the inner diagonals of `a` to `value`. - :param a: a np.array or scipy.sparse matrix, the innermost two dimensions - must be equal. - :param value: value to set the diagonals to. - :return: a np.array or scipy.sparse matrix with the same shape as `a`. - """ - a = a.copy() - if len(a.shape) < 2: - raise ValueError('a must have at least rank 2') - n = a.shape[-1] - if n != a.shape[-2]: - raise ValueError('Innermost two dimensions must be equal. 
Got {}' - .format(a.shape)) - if sp.issparse(a): - a = a.tolil() - a.setdiag(value) - return a.tocsr() - else: - idx = np.arange(n) - a[..., idx, idx] = value - return a - - -def flatten_list_gen(alist): +def _flatten_list_gen(alist): """ Performs a depth-first visit of an arbitrarily nested list and yields its element in order. @@ -177,7 +69,7 @@ def flatten_list_gen(alist): """ for item in alist: if isinstance(item, (list, tuple, np.ndarray)): - for i in flatten_list_gen(item): + for i in _flatten_list_gen(item): yield i else: yield item @@ -191,6 +83,6 @@ def flatten_list(alist): :return: a 1D Python list with the flattened elements as returned by a depth-first search. """ - return list(flatten_list_gen(alist)) + return list(_flatten_list_gen(alist)) diff --git a/tests/test_data/test_loaders.py b/tests/test_data/test_loaders.py index c420fd00..a75ea103 100644 --- a/tests/test_data/test_loaders.py +++ b/tests/test_data/test_loaders.py @@ -23,22 +23,34 @@ class TestDatasetSingle(Dataset): """ def read(self): n = 10 - return [ - Graph(x=np.random.rand(n, f), a=sp.csr_matrix(np.random.randint(0, 2, (n, n))), e=np.random.rand(n, n, s), - y=np.array(n * [[0., 1.]])) - ] + return [Graph(x=np.random.rand(n, f), + a=sp.csr_matrix(np.random.randint(0, 2, (n, n))), + e=np.random.rand(n, n, s), + y=np.array(n * [[0., 1.]]))] class TestDataset(Dataset): """ - A dataset with many graphs + A dataset with many graphs and graph-level labels """ def read(self): - return [ - Graph(x=np.random.rand(n, f), a=sp.csr_matrix(np.random.randint(0, 2, (n, n))), e=np.random.rand(n, n, s), - y=np.array([0., 1.])) - for n in ns - ] + return [Graph(x=np.random.rand(n, f), + a=sp.csr_matrix(np.random.randint(0, 2, (n, n))), + e=np.random.rand(n, n, s), + y=np.array([0., 1.])) + for n in ns] + + +class TestDatasetDsjNode(Dataset): + """ + A dataset with many graphs and node-level labels + """ + def read(self): + return [Graph(x=np.random.rand(n, f), + a=sp.csr_matrix(np.random.randint(0, 2, (n, n))), + e=np.random.rand(n, n, s), + y=np.ones((n, 2))) + for n in ns] def test_single(): @@ -69,6 +81,21 @@ def test_disjoint(): assert y.shape == (graphs_in_batch, 2) +def test_disjoint_node(): + data = TestDatasetDsjNode() + loader = DisjointLoader(data, node_level=True, batch_size=batch_size, + epochs=1, shuffle=False) + batches = [b for b in loader] + + (x, a, e, i), y = batches[-1] + n = sum(ns[-graphs_in_batch:]) + assert x.shape == (n, f) + assert a.shape == (n, n) + assert len(e.shape) == 2 and e.shape[1] == s # Avoid counting edges + assert i.shape == (n, ) + assert y.shape == (n, 2) + + def test_batch(): data = TestDataset() loader = BatchLoader(data, batch_size=batch_size, epochs=1, shuffle=False) From 554a7466d6a56f8d68d1e441a55290bd9f89aad1 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 20 Nov 2020 17:31:43 +0100 Subject: [PATCH 34/57] Add tests for transforms Fix minor issues --- docs/mkdocs.yml | 4 +- docs/templates/creating-dataset.md | 1 + docs/templates/creating-layer.md | 1 + .../{data-representation.md => data-modes.md} | 0 spektral/transforms/degree.py | 2 + spektral/utils/convolution.py | 3 +- tests/test_transforms/test_transforms.py | 99 +++++++++++++++++++ 7 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 docs/templates/creating-dataset.md create mode 100644 docs/templates/creating-layer.md rename docs/templates/{data-representation.md => data-modes.md} (100%) create mode 100644 tests/test_transforms/test_transforms.py diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 
352bafa3..2f1de14d 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -30,7 +30,9 @@ nav: - Home: index.md - Tutorials: - Getting started: getting-started.md - - Data representation: data-representation.md + - Data modes: data-modes.md + - Creating a Dataset: creating-dataset.md + - Creating a Layer: creating-layer.md - Examples: examples.md - Layers: - Convolutional Layers: layers/convolution.md diff --git a/docs/templates/creating-dataset.md b/docs/templates/creating-dataset.md new file mode 100644 index 00000000..eb03b22a --- /dev/null +++ b/docs/templates/creating-dataset.md @@ -0,0 +1 @@ +# Creating a Custom Dataset \ No newline at end of file diff --git a/docs/templates/creating-layer.md b/docs/templates/creating-layer.md new file mode 100644 index 00000000..024972c2 --- /dev/null +++ b/docs/templates/creating-layer.md @@ -0,0 +1 @@ +# Creating a Message-Passing Layer \ No newline at end of file diff --git a/docs/templates/data-representation.md b/docs/templates/data-modes.md similarity index 100% rename from docs/templates/data-representation.md rename to docs/templates/data-modes.md diff --git a/spektral/transforms/degree.py b/spektral/transforms/degree.py index 60f7132b..37f357e6 100644 --- a/spektral/transforms/degree.py +++ b/spektral/transforms/degree.py @@ -26,6 +26,8 @@ def __call__(self, graph): if 'a' not in graph: raise ValueError('The graph must have an adjacency matrix') degree = graph.a.sum(1).astype(int) + if isinstance(degree, np.matrix): + degree = np.asarray(degree)[:, 0] degree = one_hot(degree, self.max_degree + 1) if 'x' not in graph: graph.x = degree diff --git a/spektral/utils/convolution.py b/spektral/utils/convolution.py index 30a78add..69c47d63 100644 --- a/spektral/utils/convolution.py +++ b/spektral/utils/convolution.py @@ -120,7 +120,8 @@ def gcn_filter(A, symmetric=True): out[i][np.diag_indices_from(out[i])] += 1 out[i] = normalized_adjacency(out[i], symmetric=symmetric) else: - out = out.tocsr() + if hasattr(out, 'tocsr'): + out = out.tocsr() with warnings.catch_warnings(): warnings.simplefilter("ignore") out[np.diag_indices_from(out)] += 1 diff --git a/tests/test_transforms/test_transforms.py b/tests/test_transforms/test_transforms.py new file mode 100644 index 00000000..dd2cc073 --- /dev/null +++ b/tests/test_transforms/test_transforms.py @@ -0,0 +1,99 @@ +import numpy as np +import scipy.sparse as sp + +from spektral.data import Graph +from spektral.transforms import (AdjToSpTensor, Constant, Degree, GCNFilter, + LayerPreprocess, NormalizeAdj, NormalizeOne, + NormalizeSphere, OneHotLabels) + +N = 10 +F = 3 +S = 4 +n_labels = 2 +x = np.ones((N, F)) +a = sp.csr_matrix(np.ones((N, N))) +e = np.ones((N * N, S)) +y_gl = np.ones(n_labels) +y_nl = np.ones((N, n_labels)) +y_sc = 1 + + +g_gl = Graph(x=x, a=a, e=e, y=y_gl) +g_nl = Graph(x=x, a=a, e=e, y=y_nl) +g_sc = Graph(x=x, a=a, e=e, y=y_sc) + + +def test_adj_to_sp_tensor(): + t = AdjToSpTensor() + g = Graph(x=x, a=a, e=e, y=y_gl) + assert callable(t) + t(g) + + +def test_constant(): + t = Constant(10) + assert callable(t) + g = Graph(x=x, a=a, e=e, y=y_gl) + t(g) + g = Graph(x=None, a=a, e=e, y=y_gl) + t(g) + + +def test_degree(): + t = Degree(10) + assert callable(t) + g = Graph(x=x, a=a, e=e, y=y_gl) + t(g) + g = Graph(x=None, a=a, e=e, y=y_gl) + t(g) + + +def test_gcn_filter(): + t = GCNFilter() + assert callable(t) + g = Graph(x=x, a=a, e=e, y=y_nl) + t(g) + g = Graph(x=x, a=a.A, e=e, y=y_nl) + t(g) + + +def test_layer_preprocess(): + from spektral.layers import GraphConv + t = 
LayerPreprocess(GraphConv) + assert callable(t) + g = Graph(x=x, a=a, e=e, y=y_nl) + t(g) + + +def test_normalize_adj(): + t = NormalizeAdj + assert callable(t) + g = Graph(x=x, a=a, e=e, y=y_nl) + t(g) + g = Graph(x=x, a=a.A, e=e, y=y_nl) + t(g) + + +def test_normalize_one(): + t = NormalizeOne() + assert callable(t) + g = Graph(x=x, a=a, e=e, y=y_gl) + t(g) + + +def test_normalize_sphere(): + t = NormalizeSphere() + assert callable(t) + g = Graph(x=x, a=a, e=e, y=y_gl) + t(g) + + +def test_one_hot(): + t = OneHotLabels(depth=2) + assert callable(t) + g = Graph(x=x, a=a, e=e, y=y_gl) + t(g) + g = Graph(x=x, a=a, e=e, y=y_nl) + t(g) + g = Graph(x=x, a=a, e=e, y=y_sc) + t(g) \ No newline at end of file From 081e1841a87c8b420e4242011813d4c63737f830 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 20 Nov 2020 18:30:59 +0100 Subject: [PATCH 35/57] Refactor N, F, S into n_nodes, n_node_features, n_edge_features --- docs/templates/getting-started.md | 27 ++++++++++ examples/graph_prediction/custom_dataset.py | 4 +- .../graph_prediction/ogbg-mol-esol_batch.py | 8 +-- .../graph_prediction/ogbg-mol-hiv_disjoint.py | 6 +-- examples/graph_prediction/qm9_batch.py | 6 +-- examples/graph_prediction/qm9_disjoint.py | 6 +-- examples/graph_prediction/tud_disjoint.py | 4 +- examples/node_prediction/citation_arma.py | 6 +-- examples/node_prediction/citation_cheby.py | 6 +-- examples/node_prediction/citation_gat.py | 6 +-- examples/node_prediction/citation_gat_fast.py | 4 +- examples/node_prediction/citation_gcn.py | 7 ++- examples/node_prediction/citation_gcn_fast.py | 4 +- .../node_prediction/citation_simple_gc.py | 6 +-- examples/node_prediction/ogbn-arxiv_gcn.py | 4 +- examples/other/node_clustering_mincut.py | 2 +- spektral/data/dataset.py | 36 +++++++------- spektral/data/graph.py | 49 ++++++++++--------- spektral/data/loaders.py | 10 ++-- spektral/data/utils.py | 28 +++++------ spektral/layers/base.py | 8 +-- spektral/layers/convolutional/agnn_conv.py | 4 +- spektral/layers/convolutional/appnp.py | 4 +- spektral/layers/convolutional/arma_conv.py | 4 +- spektral/layers/convolutional/cheb_conv.py | 4 +- spektral/layers/convolutional/crystal_conv.py | 6 +-- .../layers/convolutional/diffusion_conv.py | 28 +++++------ spektral/layers/convolutional/ecc_conv.py | 22 ++++----- spektral/layers/convolutional/edge_conv.py | 4 +- .../layers/convolutional/gated_graph_conv.py | 4 +- spektral/layers/convolutional/gin_conv.py | 4 +- .../layers/convolutional/graph_attention.py | 8 +-- spektral/layers/convolutional/graph_conv.py | 4 +- .../layers/convolutional/graph_conv_skip.py | 4 +- .../layers/convolutional/graphsage_conv.py | 4 +- .../layers/convolutional/message_passing.py | 4 +- spektral/layers/convolutional/tag_conv.py | 4 +- spektral/layers/ops/modes.py | 8 +-- spektral/layers/ops/scatter.py | 20 ++++---- spektral/layers/pooling/diff_pool.py | 6 +-- spektral/layers/pooling/global_pool.py | 44 ++++++++--------- spektral/layers/pooling/mincut_pool.py | 8 +-- spektral/layers/pooling/sag_pool.py | 14 +++--- spektral/layers/pooling/topk_pool.py | 14 +++--- spektral/transforms/constant.py | 2 +- tests/test_data/test_dataset.py | 12 ++--- tests/test_data/test_graph.py | 3 +- tests/test_data/test_loaders.py | 2 +- tests/test_data/test_utils.py | 2 +- tests/test_transforms/test_transforms.py | 2 +- 50 files changed, 259 insertions(+), 227 deletions(-) diff --git a/docs/templates/getting-started.md b/docs/templates/getting-started.md index ec532331..4822bd25 100644 --- a/docs/templates/getting-started.md +++ 
b/docs/templates/getting-started.md @@ -5,6 +5,33 @@ Spektral is designed according to the guiding principles of the Keras API to mak The most important modules of Spektral are `layers.convolutional` and `layers.pooling`, which offer a number of popular layers to start building graph neural networks (GNNs) right away. Because Spektral is designed as an extension of Keras, you can plug any Spektral layer into an existing Keras `Model` without modifications. +In this page we will go over the main features of Spektral while creating a GNN for graph classification. + +## Graphs + +A graph is a mathematical object that represents relations between objects. We call the objects "nodes" and the relations "edges". + +Both the nodes and the edges can have vector attributes (or features). + +In Spektral, graphs are represented with instances of `spektral.data.Graph` which contain: + +- `a`: the **adjacency matrix** - usually a `scipy.sparse` matrix of shape `(n_nodes, n_nodes)`. +- `x`: the **node attributes** - represented by a `np.array` of shape `(n_nodes, n_node_attributes)`. +- `e`: the **edge attributes** - usually represented in a sparse edge list format, with a `np.array` of shape `(n_edges, n_edge_attributes)`. +- `y`: the **labels** - can represent anything, from graph labels to node labels, or even something else. + + + + + + + + + + + + + ## Node classification on citation networks In this example, we will build a simple [Graph Convolutional Network](https://arxiv.org/abs/1609.02907) for semi-supervised classification of nodes. diff --git a/examples/graph_prediction/custom_dataset.py b/examples/graph_prediction/custom_dataset.py index eb689270..db3032b3 100644 --- a/examples/graph_prediction/custom_dataset.py +++ b/examples/graph_prediction/custom_dataset.py @@ -84,8 +84,8 @@ def make_graph(): dataset = MyDataset(1000, transforms=NormalizeAdj()) # Parameters -F = dataset.F # Dimension of node features -n_out = dataset.n_out # Dimension of the target +F = dataset.n_node_features # Dimension of node features +n_out = dataset.n_labels # Dimension of the target # Train/valid/test split idxs = np.random.permutation(len(dataset)) diff --git a/examples/graph_prediction/ogbg-mol-esol_batch.py b/examples/graph_prediction/ogbg-mol-esol_batch.py index 21828032..85a9e4e5 100644 --- a/examples/graph_prediction/ogbg-mol-esol_batch.py +++ b/examples/graph_prediction/ogbg-mol-esol_batch.py @@ -31,10 +31,10 @@ dataset = OGB(ogb_dataset) # Parameters -N = max(g.N for g in dataset) -F = dataset.F # Dimension of node features -S = dataset.S # Dimension of edge features -n_out = dataset.n_out # Dimension of the target +N = max(g.n_nodes for g in dataset) +F = dataset.n_node_features # Dimension of node features +S = dataset.n_edge_features # Dimension of edge features +n_out = dataset.n_labels # Dimension of the target # Train/test split idx = ogb_dataset.get_idx_split() diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py index e73a26db..c5ce5f9f 100644 --- a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py +++ b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py @@ -33,9 +33,9 @@ dataset = OGB(ogb_dataset) # Parameters -F = dataset.F # Dimension of node features -S = dataset.S # Dimension of edge features -n_out = dataset.n_out # Dimension of the target +F = dataset.n_node_features # Dimension of node features +S = dataset.n_edge_features # Dimension of edge features +n_out = dataset.n_labels # Dimension of the target # Train/test split idx = 
ogb_dataset.get_idx_split() diff --git a/examples/graph_prediction/qm9_batch.py b/examples/graph_prediction/qm9_batch.py index 96e7f523..81540275 100644 --- a/examples/graph_prediction/qm9_batch.py +++ b/examples/graph_prediction/qm9_batch.py @@ -25,9 +25,9 @@ dataset = QM9(amount=1000) # Set amount=None to train on whole dataset # Parameters -F = dataset.F # Dimension of node features -S = dataset.S # Dimension of edge features -n_out = dataset.n_out # Dimension of the target +F = dataset.n_node_features # Dimension of node features +S = dataset.n_edge_features # Dimension of edge features +n_out = dataset.n_labels # Dimension of the target # Train/test split idxs = np.random.permutation(len(dataset)) diff --git a/examples/graph_prediction/qm9_disjoint.py b/examples/graph_prediction/qm9_disjoint.py index 4fb7cfa7..c7e64e24 100644 --- a/examples/graph_prediction/qm9_disjoint.py +++ b/examples/graph_prediction/qm9_disjoint.py @@ -27,9 +27,9 @@ dataset = QM9(amount=1000) # Set amount=None to train on whole dataset # Parameters -F = dataset.F # Dimension of node features -S = dataset.S # Dimension of edge features -n_out = dataset.n_out # Dimension of the target +F = dataset.n_node_features # Dimension of node features +S = dataset.n_edge_features # Dimension of edge features +n_out = dataset.n_labels # Dimension of the target # Train/test split idxs = np.random.permutation(len(dataset)) diff --git a/examples/graph_prediction/tud_disjoint.py b/examples/graph_prediction/tud_disjoint.py index 7d7ab1d7..72f27cca 100644 --- a/examples/graph_prediction/tud_disjoint.py +++ b/examples/graph_prediction/tud_disjoint.py @@ -31,8 +31,8 @@ dataset = TUDataset('PROTEINS', clean=True) # Parameters -F = dataset.F # Dimension of node features -n_out = dataset.n_out # Dimension of the target +F = dataset.n_node_features # Dimension of node features +n_out = dataset.n_labels # Dimension of the target # Train/test split idxs = np.random.permutation(len(dataset)) diff --git a/examples/node_prediction/citation_arma.py b/examples/node_prediction/citation_arma.py index 0aeb6d82..0d87144d 100644 --- a/examples/node_prediction/citation_arma.py +++ b/examples/node_prediction/citation_arma.py @@ -34,9 +34,9 @@ patience = 100 # Patience for early stopping a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes +N = dataset.n_nodes # Number of nodes in the graph +F = dataset.n_node_features # Original size of node features +n_out = dataset.n_labels # Number of classes # Model definition x_in = Input(shape=(F,)) diff --git a/examples/node_prediction/citation_cheby.py b/examples/node_prediction/citation_cheby.py index 644f5c8d..0482c121 100644 --- a/examples/node_prediction/citation_cheby.py +++ b/examples/node_prediction/citation_cheby.py @@ -32,9 +32,9 @@ patience = 10 # Patience for early stopping a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes +N = dataset.n_nodes # Number of nodes in the graph +F = dataset.n_node_features # Original size of node features +n_out = dataset.n_labels # Number of classes # Model definition x_in = Input(shape=(F,)) diff --git a/examples/node_prediction/citation_gat.py b/examples/node_prediction/citation_gat.py index ea484915..3aab06fc 100644 --- a/examples/node_prediction/citation_gat.py +++ 
b/examples/node_prediction/citation_gat.py @@ -31,9 +31,9 @@ patience = 100 # Patience for early stopping a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes +N = dataset.n_nodes # Number of nodes in the graph +F = dataset.n_node_features # Original size of node features +n_out = dataset.n_labels # Number of classes # Model definition x_in = Input(shape=(F,)) diff --git a/examples/node_prediction/citation_gat_fast.py b/examples/node_prediction/citation_gat_fast.py index 7a8c2902..02e7b4d1 100644 --- a/examples/node_prediction/citation_gat_fast.py +++ b/examples/node_prediction/citation_gat_fast.py @@ -24,7 +24,7 @@ mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Define model -x_in = Input(shape=(dataset.F,)) +x_in = Input(shape=(dataset.n_node_features,)) a_in = Input(shape=(None,), sparse=True) x_1 = Dropout(0.6)(x_in) x_1 = GraphAttention(8, @@ -36,7 +36,7 @@ attn_kernel_regularizer=l2(5e-4), bias_regularizer=l2(5e-4))([x_1, a_in]) x_2 = Dropout(0.6)(x_1) -x_2 = GraphAttention(dataset.n_out, +x_2 = GraphAttention(dataset.n_labels, attn_heads=1, concat_heads=True, dropout_rate=0.6, diff --git a/examples/node_prediction/citation_gcn.py b/examples/node_prediction/citation_gcn.py index c4e62475..1ef604ae 100644 --- a/examples/node_prediction/citation_gcn.py +++ b/examples/node_prediction/citation_gcn.py @@ -15,7 +15,6 @@ from spektral.datasets.citation import Citation from spektral.layers import GraphConv from spektral.transforms import LayerPreprocess, AdjToSpTensor -import tensorflow as tf # Load data dataset = Citation('cora', @@ -31,9 +30,9 @@ patience = 10 # Patience for early stopping a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes +N = dataset.n_nodes # Number of nodes in the graph +F = dataset.n_node_features # Original size of node features +n_out = dataset.n_labels # Number of classes # Model definition x_in = Input(shape=(F,)) diff --git a/examples/node_prediction/citation_gcn_fast.py b/examples/node_prediction/citation_gcn_fast.py index 0a3044f2..ee6f8dfa 100644 --- a/examples/node_prediction/citation_gcn_fast.py +++ b/examples/node_prediction/citation_gcn_fast.py @@ -23,8 +23,8 @@ mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Define model -x_in = Input(shape=(dataset.F,)) -a_in = Input((dataset.F,), sparse=True) +x_in = Input(shape=(dataset.n_node_features,)) +a_in = Input((dataset.n_node_features,), sparse=True) x_1 = GraphConv(16, 'relu', True, kernel_regularizer=l2(5e-4))([x_in, a_in]) x_1 = Dropout(0.5)(x_1) x_2 = GraphConv(y.shape[1], 'softmax', True)([x_1, a_in]) diff --git a/examples/node_prediction/citation_simple_gc.py b/examples/node_prediction/citation_simple_gc.py index 70abef4b..f9770c1b 100644 --- a/examples/node_prediction/citation_simple_gc.py +++ b/examples/node_prediction/citation_simple_gc.py @@ -47,9 +47,9 @@ def __call__(self, graph): patience = 200 # Patience for early stopping a_dtype = dataset[0].a.dtype # Only needed for TF 2.1 -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features -n_out = dataset.n_out # Number of classes +N = dataset.n_nodes # Number of nodes in the graph +F = dataset.n_node_features # Original size of node features +n_out = dataset.n_labels # Number of 
classes # Model definition x_in = Input(shape=(F,)) diff --git a/examples/node_prediction/ogbn-arxiv_gcn.py b/examples/node_prediction/ogbn-arxiv_gcn.py index b17ac4f1..1d8d3af5 100644 --- a/examples/node_prediction/ogbn-arxiv_gcn.py +++ b/examples/node_prediction/ogbn-arxiv_gcn.py @@ -28,8 +28,8 @@ learning_rate = 1e-2 # Learning rate epochs = 200 # Number of training epochs -N = dataset.N # Number of nodes in the graph -F = dataset.F # Original size of node features +N = dataset.n_nodes # Number of nodes in the graph +F = dataset.n_node_features # Original size of node features n_out = ogb_dataset.num_classes # OGB labels are sparse indices # Data splits diff --git a/examples/other/node_clustering_mincut.py b/examples/other/node_clustering_mincut.py index 62af8d2e..592b78a0 100644 --- a/examples/other/node_clustering_mincut.py +++ b/examples/other/node_clustering_mincut.py @@ -42,7 +42,7 @@ def train_step(inputs): adj, x, y = dataset[0].a, dataset[0].x, dataset[0].y a_norm = normalized_adjacency(adj) a_norm = sp_matrix_to_sp_tensor(a_norm) -F = dataset.F +F = dataset.n_node_features y = np.argmax(y, axis=-1) n_clusters = y.max() + 1 diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 313a681d..3e8c7cdf 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -26,11 +26,11 @@ class Dataset: Datasets have the following properties that automatically computed from the graphs: - - `N`: the number of nodes in the dataset (returns `None` if the number + - `n_nodes`: the number of nodes in the dataset (returns `None` if the number changes between graphs); - - `F`: the size of the node features (returns `None` if the size changes + - `n_node_features`: the size of the node features (returns `None` if the size changes between graphs or is not defined); - - `S`: the size of the edge features (returns `None` if the size changes + - `n_edge_features`: the size of the edge features (returns `None` if the size changes between graphs or is not defined); - `n_labels`: the size of the labels (returns `None` if the size changes between graphs or is not defined); this is computed as the innermost @@ -51,11 +51,11 @@ class Dataset: - `map(transform, reduce=None)`: returns a list containing the output of `transform(graph)` for each graph. If `reduce` is a `callable`, then returns `reduce(output_list)` instead of just `output_list`. - For instance: `map(lambda: g.N, reduce=np.mean)` will return the average + For instance: `map(lambda: g.n_nodes, reduce=np.mean)` will return the average number of nodes in the dataset. - `filter(function)`: removes from the dataset any graph for which `function(graph)` returns `False`. - For example: `filter(lambda: g.N < 100)` removes from the dataset all graphs + For example: `filter(lambda: g.n_nodes < 100)` removes from the dataset all graphs bigger than 100 nodes. You can extend this class to create your own dataset. 
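As a concrete illustration of that last point, a minimal custom dataset could be sketched as follows. This assumes only that the subclass overrides `read()` and that `read()` returns a list of `Graph` objects, as in the library's tests and the `custom_dataset.py` example; the class name and the random contents are made up for the illustration:

```python
import numpy as np
import scipy.sparse as sp

from spektral.data import Dataset, Graph


class RandomGraphs(Dataset):
    """A toy dataset of 100 random graphs (illustrative sketch)."""

    def read(self):
        graphs = []
        for _ in range(100):
            n = np.random.randint(10, 20)                   # number of nodes
            x = np.random.rand(n, 3)                        # node features, shape (n_nodes, n_node_features)
            a = sp.random(n, n, density=0.2, format='csr')  # sparse adjacency matrix
            y = np.array([0., 1.])                          # graph-level label
            graphs.append(Graph(x=x, a=a, y=y))
        return graphs
```

An instance of such a subclass then exposes the properties and methods described above (`n_node_features`, `map`, `filter`, and so on).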
@@ -186,28 +186,28 @@ def path(self): return osp.join(DATASET_FOLDER, self.__class__.__name__) @property - def N(self): - if len(self.graphs) == 1 or len(set([g.N for g in self.graphs])) == 1: - return self.graphs[0].N + def n_nodes(self): + if len(self.graphs) == 1 or len(set([g.n_nodes for g in self.graphs])) == 1: + return self.graphs[0].n_nodes else: return None @property - def F(self): - if len(self.graphs) == 1 or len(set([g.F for g in self.graphs])) == 1: - return self.graphs[0].F + def n_node_features(self): + if len(self.graphs) == 1 or len(set([g.n_node_features for g in self.graphs])) == 1: + return self.graphs[0].n_node_features else: return None @property - def S(self): - if len(self.graphs) == 1 or len(set([g.S for g in self.graphs])) == 1: - return self.graphs[0].S + def n_edge_features(self): + if len(self.graphs) == 1 or len(set([g.n_edge_features for g in self.graphs])) == 1: + return self.graphs[0].n_edge_features else: return None @property - def n_out(self): + def n_labels(self): if len(self.graphs) == 1 or len(set([g.n_labels for g in self.graphs])) == 1: return self.graphs[0].n_labels else: @@ -233,7 +233,7 @@ def signature(self): if graph.x is not None: signature['x'] = dict() signature['x']['spec'] = get_spec(graph.x) - signature['x']['shape'] = (None, self.F) + signature['x']['shape'] = (None, self.n_node_features) signature['x']['dtype'] = tf.as_dtype(graph.x.dtype) if graph.a is not None: signature['a'] = dict() @@ -243,11 +243,11 @@ def signature(self): if graph.e is not None: signature['e'] = dict() signature['e']['spec'] = get_spec(graph.e) - signature['e']['shape'] = (None, self.S) + signature['e']['shape'] = (None, self.n_edge_features) signature['e']['dtype'] = tf.as_dtype(graph.e.dtype) if graph.y is not None: signature['y'] = dict() signature['y']['spec'] = get_spec(graph.y) - signature['y']['shape'] = (self.n_out, ) + signature['y']['shape'] = (self.n_labels,) signature['y']['dtype'] = tf.as_dtype(np.array(graph.y).dtype) return signature diff --git a/spektral/data/graph.py b/spektral/data/graph.py index 10c18af0..32dc4d3c 100644 --- a/spektral/data/graph.py +++ b/spektral/data/graph.py @@ -1,4 +1,5 @@ import numpy as np +import scipy.sparse as sp class Graph: @@ -19,9 +20,10 @@ class Graph: Graphs also have the following attributes that are computed automatically from the data: - - `N`: number of nodes; - - `F`: size of the node features, if available; - - `S`: size of the edge features, if available; + - `n_nodes`: number of nodes; + - `n_edges`: number of edges; + - `n_node_features`: size of the node features, if available; + - `n_edge_features`: size of the edge features, if available; - `n_labels`: size of the labels, if available; Any additional `kwargs` passed to the constructor will be automatically @@ -32,34 +34,28 @@ class Graph: Spektral usually assumes that the different data matrices have specific shapes, although this is not strictly enforced to allow more flexibility. - In general, node attributes should have shape `(N, F)` and the adjacency - matrix should have shape `(N, N)`. - - A Graph should always have either the node features or the adjacency matrix. - Empty graphs are not supported. + In general, node attributes should have shape `(n_nodes, n_node_features)` and the adjacency + matrix should have shape `(n_nodes, n_nodes)`. 
Edge attributes can be stored in a dense format as arrays of shape - `(N, N, S)` or in a sparse format as arrays of shape `(n_edges, S)` + `(n_nodes, n_nodes, n_edge_features)` or in a sparse format as arrays of shape `(n_edges, n_edge_features)` (so that you don't have to store all the zeros for missing edges). Most components of Spektral will know how to deal with both situations automatically. Labels can refer to the entire graph (shape `(n_labels, )`) or to each - individual node (shape `(N, n_labels)`). + individual node (shape `(n_nodes, n_labels)`). **Arguments** - - `x`: np.array, the node features (shape `(N, F)`); - - `a`: np.array or scipy.sparse matrix, the adjacency matrix (shape `(N, N)`); - - `e`: np.array, the edge features (shape `(N, N, S)` or `(n_edges, S)`); - - `y`: np.array, the node or graph labels (shape `(N, n_labels)` or `(n_labels, )`); + - `x`: np.array, the node features (shape `(n_nodes, n_node_features)`); + - `a`: np.array or scipy.sparse matrix, the adjacency matrix (shape `(n_nodes, n_nodes)`); + - `e`: np.array, the edge features (shape `(n_nodes, n_nodes, n_edge_features)` or `(n_edges, n_edge_features)`); + - `y`: np.array, the node or graph labels (shape `(n_nodes, n_labels)` or `(n_labels, )`); """ def __init__(self, x=None, a=None, e=None, y=None, **kwargs): - if x is None and a is None: - raise ValueError('A Graph should have either node attributes or ' - 'an adjacency matrix. Got both None.') self.x = x self.a = a self.e = e @@ -83,11 +79,11 @@ def __contains__(self, key): return key in self.keys def __repr__(self): - return 'Graph(N={}, F={}, S={}, y={})'\ - .format(self.N, self.F, self.S, self.y) + return 'Graph(n_nodes={}, n_node_features={}, n_edge_features={}, y={})'\ + .format(self.n_nodes, self.n_node_features, self.n_edge_features, self.y) @property - def N(self): + def n_nodes(self): if self.x is not None: return self.x.shape[-2] elif self.a is not None: @@ -96,14 +92,23 @@ def N(self): return None @property - def F(self): + def n_edges(self): + if sp.issparse(self.a): + return self.a.nnz + elif isinstance(self.a, np.ndarray): + return np.nonzero(self.a) + else: + return None + + @property + def n_node_features(self): if self.x is not None: return self.x.shape[-1] else: return None @property - def S(self): + def n_edge_features(self): if self.e is not None: return self.e.shape[-1] else: diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index 6dabc9bc..bbf2b3d8 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -201,7 +201,7 @@ class DisjointLoader(Loader): The adjacency matrix will always be returned as a SparseTensor, regardless of the input. Edge attributes will be returned as a sparse edge list of shape - `(n_edges, S)`. + `(n_edges, n_edge_features)`. 
If `node_level=False`, the labels are interpreted as graph-level labels and are stacked along an additional dimension (i.e., `(n_graphs, n_labels)`) @@ -226,7 +226,7 @@ class DisjointLoader(Loader): - `x`: node attributes stacked along the outermost dimension; - `a`: SparseTensor, the block-diagonal matrix obtained from the adjacency matrices of the batch; - - `e`: edge attributes as edge list of shape `(n_edges, S)`; + - `e`: edge attributes as edge list of shape `(n_edges, n_edge_features)`; If `node_level=False`, `labels` has shape `(n_graphs, n_labels)`; If `node_level=True`, then the labels are stacked vertically, i.e., @@ -282,7 +282,7 @@ class BatchLoader(Loader): If `n_max` is the number of nodes of the biggest graph in the batch, then the padding consist of adding zeros to the node features, adjacency matrix, and edge attributes of each graph so that they have shapes - `(n_max, F)`, `(n_max, n_max)`, and `(n_max, n_max, S)` respectively. + `(n_max, n_node_features)`, `(n_max, n_max)`, and `(n_max, n_max, n_edge_features)` respectively. The zero-padding is done batch-wise, which saves up memory at the cost of more computation. If latency is an issue but memory isn't, or if the @@ -312,11 +312,11 @@ class BatchLoader(Loader): `inputs` is a tuple containing: - `x`: node attributes, zero-padded and stacked along an extra dimension - (shape `(n_graphs, n_max, F)`); + (shape `(n_graphs, n_max, n_node_features)`); - `a`: adjacency matrices (dense), zero-padded and stacked along an extra dimension (shape `(n_graphs, n_max, n_max)`); - `e`: edge attributes (dense), zero-padded and stacked along an extra - dimension (shape `(n_graphs, n_max, n_max, S)`). + dimension (shape `(n_graphs, n_max, n_max, n_edge_features)`). `labels` are also stacked along an extra dimension. diff --git a/spektral/data/utils.py b/spektral/data/utils.py index ad7b0edb..f66569b1 100644 --- a/spektral/data/utils.py +++ b/spektral/data/utils.py @@ -26,20 +26,20 @@ def to_disjoint(x_list, a_list, e_list=None): The edge attributes of a graph can be represented as - - a dense array of shape `(N, N, S)`; - - a sparse edge list of shape `(n_edges, S)`; + - a dense array of shape `(n_nodes, n_nodes, n_edge_features)`; + - a sparse edge list of shape `(n_edges, n_edge_features)`; and they will always be returned as edge list for efficiency. 
- :param x_list: a list of np.arrays of shape `(N, F)` -- note that `N` can + :param x_list: a list of np.arrays of shape `(n_nodes, n_node_features)` -- note that `n_nodes` can change between graphs; :param a_list: a list of np.arrays or scipy.sparse matrices of shape - `(N, N)`; - :param e_list: a list of np.arrays of shape `(N, N, S)` or `(n_edges, S)`; + `(n_nodes, n_nodes)`; + :param e_list: a list of np.arrays of shape `(n_nodes, n_nodes, n_edge_features)` or `(n_edges, n_edge_features)`; :return: - - `x`: np.array of shape `(n_nodes, F)`; + - `x`: np.array of shape `(n_nodes, n_node_features)`; - `a`: scipy.sparse matrix of shape `(n_nodes, n_nodes)`; - - `e`: (optional) np.array of shape `(n_edges, S)`; + - `e`: (optional) np.array of shape `(n_edges, n_edge_features)`; - `i`: np.array of shape `(n_nodes, )`; """ _check_input(x_list, a_list, e_list) @@ -77,21 +77,21 @@ def to_batch(x_list, a_list, e_list=None): The edge attributes of a graph can be represented as - - a dense array of shape `(N, N, S)`; - - a sparse edge list of shape `(n_edges, S)`; + - a dense array of shape `(n_nodes, n_nodes, n_edge_features)`; + - a sparse edge list of shape `(n_edges, n_edge_features)`; and they will always be returned as dense arrays. - :param x_list: a list of np.arrays of shape `(N, F)` -- note that `N` can + :param x_list: a list of np.arrays of shape `(n_nodes, n_node_features)` -- note that `n_nodes` can change between graphs; :param a_list: a list of np.arrays or scipy.sparse matrices of shape - `(N, N)`; - :param e_list: a list of np.arrays of shape `(N, N, S)`; + `(n_nodes, n_nodes)`; + :param e_list: a list of np.arrays of shape `(n_nodes, n_nodes, n_edge_features)`; :return: - - `x`: np.array of shape `(batch, n_max, F)`; + - `x`: np.array of shape `(batch, n_max, n_node_features)`; - `a`: np.array of shape `(batch, n_max, n_max)`; - `e`: (only if `e_list` is given) np.array of shape - `(batch, n_max, n_max, S)`; + `(batch, n_max, n_max, n_edge_features)`; """ _check_input(x_list, a_list, e_list) n_max = max([a.shape[-1] for a in a_list]) diff --git a/spektral/layers/base.py b/spektral/layers/base.py index a680cc14..755e0a8c 100644 --- a/spektral/layers/base.py +++ b/spektral/layers/base.py @@ -241,13 +241,13 @@ class Disjoint2Batch(Layer): **Input** - - Node features of shape `(N, F)`; - - Binary adjacency matrix of shape `(N, N)`; - - Graph IDs of shape `(N, )`; + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`; + - Graph IDs of shape `(n_nodes, )`; **Output** - - Batched node features of shape `(batch, N_max, F)`; + - Batched node features of shape `(batch, N_max, n_node_features)`; - Batched adjacency matrix of shape `(batch, N_max, N_max)`; """ diff --git a/spektral/layers/convolutional/agnn_conv.py b/spektral/layers/convolutional/agnn_conv.py index b7b3a7bb..621d6528 100644 --- a/spektral/layers/convolutional/agnn_conv.py +++ b/spektral/layers/convolutional/agnn_conv.py @@ -31,8 +31,8 @@ class AGNNConv(MessagePassing): **Input** - - Node features of shape `(N, F)`; - - Binary adjacency matrix of shape `(N, N)`. + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`. 
**Output** diff --git a/spektral/layers/convolutional/appnp.py b/spektral/layers/convolutional/appnp.py index a5e16208..b5cb4a38 100644 --- a/spektral/layers/convolutional/appnp.py +++ b/spektral/layers/convolutional/appnp.py @@ -24,8 +24,8 @@ class APPNP(GraphConv): **Input** - - Node features of shape `([batch], N, F)`; - - Modified Laplacian of shape `([batch], N, N)`; can be computed with + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Modified Laplacian of shape `([batch], n_nodes, n_nodes)`; can be computed with `spektral.utils.convolution.gcn_filter`. **Output** diff --git a/spektral/layers/convolutional/arma_conv.py b/spektral/layers/convolutional/arma_conv.py index 2283b9c6..73c56a40 100644 --- a/spektral/layers/convolutional/arma_conv.py +++ b/spektral/layers/convolutional/arma_conv.py @@ -32,8 +32,8 @@ class ARMAConv(GraphConv): **Input** - - Node features of shape `([batch], N, F)`; - - Normalized and rescaled Laplacian of shape `([batch], N, N)`; can be + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Normalized and rescaled Laplacian of shape `([batch], n_nodes, n_nodes)`; can be computed with `spektral.utils.convolution.normalized_laplacian` and `spektral.utils.convolution.rescale_laplacian`. diff --git a/spektral/layers/convolutional/cheb_conv.py b/spektral/layers/convolutional/cheb_conv.py index 51f7cf93..d4739f72 100644 --- a/spektral/layers/convolutional/cheb_conv.py +++ b/spektral/layers/convolutional/cheb_conv.py @@ -31,9 +31,9 @@ class ChebConv(GraphConv): **Input** - - Node features of shape `([batch], N, F)`; + - Node features of shape `([batch], n_nodes, n_node_features)`; - A list of K Chebyshev polynomials of shape - `[([batch], N, N), ..., ([batch], N, N)]`; can be computed with + `[([batch], n_nodes, n_nodes), ..., ([batch], n_nodes, n_nodes)]`; can be computed with `spektral.utils.convolution.chebyshev_filter`. **Output** diff --git a/spektral/layers/convolutional/crystal_conv.py b/spektral/layers/convolutional/crystal_conv.py index bc1cdd8c..3a1eca47 100644 --- a/spektral/layers/convolutional/crystal_conv.py +++ b/spektral/layers/convolutional/crystal_conv.py @@ -27,9 +27,9 @@ class CrystalConv(MessagePassing): **Input** - - Node features of shape `(N, F)`; - - Binary adjacency matrix of shape `(N, N)`. - - Edge features of shape `(num_edges, S)`. + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`. + - Edge features of shape `(num_edges, n_edge_features)`. **Output** diff --git a/spektral/layers/convolutional/diffusion_conv.py b/spektral/layers/convolutional/diffusion_conv.py index d9ce1240..477247fb 100644 --- a/spektral/layers/convolutional/diffusion_conv.py +++ b/spektral/layers/convolutional/diffusion_conv.py @@ -11,9 +11,9 @@ class DiffuseFeatures(layers.Layer): **Input** - - Node features of shape `([batch], N, F)`; + - Node features of shape `([batch], n_nodes, n_node_features)`; - Normalized adjacency or attention coef. matrix \(\hat \A \) of shape - `([batch], N, N)`; Use DiffusionConvolution.preprocess to normalize. + `([batch], n_nodes, n_nodes)`; Use DiffusionConvolution.preprocess to normalize. 
**Output** @@ -68,7 +68,7 @@ def call(self, inputs): # unstack kernel diffusion_matrix = tf.math.polyval(tf.unstack(self.kernel), A) - # Apply it to X to get a matrix C = [C_1, ..., C_F] (N x F) + # Apply it to X to get a matrix C = [C_1, ..., C_F] (n_nodes x n_node_features) # of diffused features diffused_features = tf.matmul(diffusion_matrix, X) @@ -76,8 +76,8 @@ def call(self, inputs): # and apply a non linearity to obtain H:,q (eq. 3 in paper) H = tf.math.reduce_sum(diffused_features, axis=-1) - # H has shape ([batch], N) but as it is the sum of columns - # we reshape it to ([batch], N, 1) + # H has shape ([batch], n_nodes) but as it is the sum of columns + # we reshape it to ([batch], n_nodes, 1) return tf.expand_dims(H, -1) @@ -93,7 +93,7 @@ class DiffusionConv(GraphConv): this layer calculates the q'th channel as: $$ \mathbf{H}_{~:,~q} = \sigma\left( - \sum_{f=1}^{F} + \sum_{f=1}^{n_node_features} \left( \sum_{k=0}^{K-1}\theta_k {\hat \A}^k \right) @@ -103,9 +103,9 @@ class DiffusionConv(GraphConv): **Input** - - Node features of shape `([batch], N, F)`; + - Node features of shape `([batch], n_nodes, n_node_features)`; - Normalized adjacency or attention coef. matrix \(\hat \A \) of shape - `([batch], N, N)`; Use `DiffusionConvolution.preprocess` to normalize. + `([batch], n_nodes, n_nodes)`; Use `DiffusionConvolution.preprocess` to normalize. **Output** @@ -149,8 +149,8 @@ def __init__( def build(self, input_shape): # We expect to receive (X, A) - # A - Adjacency ([batch], N, N) - # X - graph signal ([batch], N, F) + # A - Adjacency ([batch], n_nodes, n_nodes) + # X - graph signal ([batch], n_nodes, n_node_features) X_shape, A_shape = input_shape # initialise Q diffusion convolution filters @@ -167,14 +167,14 @@ def build(self, input_shape): def apply_filters(self, X, A): """Applies diffusion convolution self.Q times to get a - ([batch], N, Q) diffused graph signal + ([batch], n_nodes, Q) diffused graph signal """ # This will be a list of Q diffused features. - # Each diffused feature is a (batch, N, 1) tensor. + # Each diffused feature is a (batch, n_nodes, 1) tensor. # Later we will concat all the features to get one - # (batch, N, Q) diffused graph signal + # (batch, n_nodes, Q) diffused graph signal diffused_features = [] # Iterating over all Q diffusion filters @@ -182,7 +182,7 @@ def apply_filters(self, X, A): diffused_feature = diffusion((X, A)) diffused_features.append(diffused_feature) - # Concat them into ([batch], N, Q) diffused graph signal + # Concat them into ([batch], n_nodes, Q) diffused graph signal H = tf.concat(diffused_features, -1) return H diff --git a/spektral/layers/convolutional/ecc_conv.py b/spektral/layers/convolutional/ecc_conv.py index 39121d59..70ef70c3 100644 --- a/spektral/layers/convolutional/ecc_conv.py +++ b/spektral/layers/convolutional/ecc_conv.py @@ -28,10 +28,10 @@ class EdgeConditionedConv(GraphConv): **Input** - - Node features of shape `([batch], N, F)`; - - Binary adjacency matrices of shape `([batch], N, N)`; - - Edge features. In single mode, shape `(num_edges, S)`; in batch mode, shape - `(batch, N, N, S)`. + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Binary adjacency matrices of shape `([batch], n_nodes, n_nodes)`; + - Edge features. In single mode, shape `(num_edges, n_edge_features)`; in batch mode, shape + `(batch, n_nodes, n_nodes, n_edge_features)`. 
**Output** @@ -124,9 +124,9 @@ def build(self, input_shape): self.built = True def call(self, inputs): - X = inputs[0] # (batch_size, N, F) - A = inputs[1] # (batch_size, N, N) - E = inputs[2] # (n_edges, S) or (batch_size, N, N, S) + X = inputs[0] # (batch_size, n_nodes, n_node_features) + A = inputs[1] # (batch_size, n_nodes, n_nodes) + E = inputs[2] # (n_edges, n_edge_features) or (batch_size, n_nodes, n_nodes, n_edge_features) mode = ops.autodetect_mode(A, X) if mode == modes.SINGLE: @@ -158,10 +158,10 @@ def call(self, inputs): return output def _call_single(self, inputs): - X = inputs[0] # (N, F) - A = inputs[1] # (N, N) - E = inputs[2] # (n_edges, S) - assert K.ndim(E) == 2, 'In single mode, E must have shape (n_edges, S).' + X = inputs[0] # (n_nodes, F) + A = inputs[1] # (n_nodes, n_nodes) + E = inputs[2] # (n_edges, n_edge_features) + assert K.ndim(E) == 2, 'In single mode, E must have shape (n_edges, n_edge_features).' # Enforce sparse representation if not K.is_sparse(A): diff --git a/spektral/layers/convolutional/edge_conv.py b/spektral/layers/convolutional/edge_conv.py index 4befd20e..6af51eaa 100644 --- a/spektral/layers/convolutional/edge_conv.py +++ b/spektral/layers/convolutional/edge_conv.py @@ -22,8 +22,8 @@ class EdgeConv(MessagePassing): **Input** - - Node features of shape `(N, F)`; - - Binary adjacency matrix of shape `(N, N)`. + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`. **Output** diff --git a/spektral/layers/convolutional/gated_graph_conv.py b/spektral/layers/convolutional/gated_graph_conv.py index de53d334..92d0a4e6 100644 --- a/spektral/layers/convolutional/gated_graph_conv.py +++ b/spektral/layers/convolutional/gated_graph_conv.py @@ -27,9 +27,9 @@ class GatedGraphConv(MessagePassing): **Input** - - Node features of shape `(N, F)`; note that `F` must be smaller or equal + - Node features of shape `(n_nodes, n_node_features)`; note that `n_node_features` must be smaller or equal than `channels`. - - Binary adjacency matrix of shape `(N, N)`. + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`. **Output** diff --git a/spektral/layers/convolutional/gin_conv.py b/spektral/layers/convolutional/gin_conv.py index 1b9c9633..fb27a769 100644 --- a/spektral/layers/convolutional/gin_conv.py +++ b/spektral/layers/convolutional/gin_conv.py @@ -22,8 +22,8 @@ class GINConv(MessagePassing): **Input** - - Node features of shape `(N, F)`; - - Binary adjacency matrix of shape `(N, N)`. + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`. **Output** diff --git a/spektral/layers/convolutional/graph_attention.py b/spektral/layers/convolutional/graph_attention.py index 1706711f..eacaa904 100644 --- a/spektral/layers/convolutional/graph_attention.py +++ b/spektral/layers/convolutional/graph_attention.py @@ -48,8 +48,8 @@ class GraphAttention(GraphConv): **Input** - - Node features of shape `([batch], N, F)`; - - Binary adjacency matrix of shape `([batch], N, N)`; + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `([batch], n_nodes, n_nodes)`; **Output** @@ -57,7 +57,7 @@ class GraphAttention(GraphConv): dimension changed to `channels`; - if `return_attn_coef=True`, a list with the attention coefficients for each attention head. Each attention coefficient matrix has shape - `([batch], N, N)`. + `([batch], n_nodes, n_nodes)`. 
**Arguments** @@ -67,7 +67,7 @@ class GraphAttention(GraphConv): heads instead of averaging; - `dropout_rate`: internal dropout rate for attention coefficients; - `return_attn_coef`: if True, return the attention coefficients for - the given input (one N x N matrix for each head). + the given input (one n_nodes x n_nodes matrix for each head). - `activation`: activation function to use; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; diff --git a/spektral/layers/convolutional/graph_conv.py b/spektral/layers/convolutional/graph_conv.py index 0a77eb66..255284e0 100644 --- a/spektral/layers/convolutional/graph_conv.py +++ b/spektral/layers/convolutional/graph_conv.py @@ -22,8 +22,8 @@ class GraphConv(Layer): **Input** - - Node features of shape `([batch], N, F)`; - - Modified Laplacian of shape `([batch], N, N)`; can be computed with + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Modified Laplacian of shape `([batch], n_nodes, n_nodes)`; can be computed with `spektral.utils.convolution.gcn_filter`. **Output** diff --git a/spektral/layers/convolutional/graph_conv_skip.py b/spektral/layers/convolutional/graph_conv_skip.py index a1713e5b..fa40495b 100644 --- a/spektral/layers/convolutional/graph_conv_skip.py +++ b/spektral/layers/convolutional/graph_conv_skip.py @@ -19,8 +19,8 @@ class GraphConvSkip(GraphConv): **Input** - - Node features of shape `([batch], N, F)`; - - Normalized adjacency matrix of shape `([batch], N, N)`; can be computed + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Normalized adjacency matrix of shape `([batch], n_nodes, n_nodes)`; can be computed with `spektral.utils.convolution.normalized_adjacency`. **Output** diff --git a/spektral/layers/convolutional/graphsage_conv.py b/spektral/layers/convolutional/graphsage_conv.py index 71161fe0..00700791 100644 --- a/spektral/layers/convolutional/graphsage_conv.py +++ b/spektral/layers/convolutional/graphsage_conv.py @@ -23,8 +23,8 @@ class GraphSageConv(GraphConv): **Input** - - Node features of shape `(N, F)`; - - Binary adjacency matrix of shape `(N, N)`. + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`. **Output** diff --git a/spektral/layers/convolutional/message_passing.py b/spektral/layers/convolutional/message_passing.py index 3d73a27a..1ce7124b 100644 --- a/spektral/layers/convolutional/message_passing.py +++ b/spektral/layers/convolutional/message_passing.py @@ -60,8 +60,8 @@ class MessagePassing(Layer): - `aggregate`: string or callable, an aggregate function. This flag can be used to control the behaviour of `aggregate()` wihtout re-implementing it. Supported aggregations: 'sum', 'mean', 'max', 'min', 'prod'. - If callable, the function must have the signature `foo(updates, indices, N)` - and return a rank 2 tensor with shape `(N, ...)`. + If callable, the function must have the signature `foo(updates, indices, n_nodes)` + and return a rank 2 tensor with shape `(n_nodes, ...)`. 
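To make the callable option concrete, here is a sketch of a custom aggregation with the signature documented above; the subclass named in the final comment is hypothetical:

```python
import tensorflow as tf


def clipped_sum(updates, indices, n_nodes):
    # Sum the incoming messages for each target node, then clip the result.
    # Follows the documented signature foo(updates, indices, n_nodes) and
    # returns a tensor of shape (n_nodes, ...).
    aggregated = tf.math.unsorted_segment_sum(updates, indices, n_nodes)
    return tf.clip_by_value(aggregated, -5.0, 5.0)


# my_layer = MyMessagePassingLayer(aggregate=clipped_sum)  # hypothetical subclass of MessagePassing
```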
""" def __init__(self, aggregate='sum', **kwargs): super().__init__(**{k: v for k, v in kwargs.items() if is_keras_kwarg(k)}) diff --git a/spektral/layers/convolutional/tag_conv.py b/spektral/layers/convolutional/tag_conv.py index 3a2aaf98..c935f3a9 100644 --- a/spektral/layers/convolutional/tag_conv.py +++ b/spektral/layers/convolutional/tag_conv.py @@ -21,8 +21,8 @@ class TAGConv(MessagePassing): **Input** - - Node features of shape `(N, F)`; - - Binary adjacency matrix of shape `(N, N)`. + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`. **Output** diff --git a/spektral/layers/ops/modes.py b/spektral/layers/ops/modes.py index 1fd2d13a..f344a09d 100644 --- a/spektral/layers/ops/modes.py +++ b/spektral/layers/ops/modes.py @@ -13,8 +13,8 @@ def disjoint_signal_to_batch(X, I): Converts a disjoint graph signal to batch node by zero-padding. :param X: Tensor, node features of shape (nodes, features). - :param I: Tensor, graph IDs of shape `(N, )`; - :return batch: Tensor, batched node features of shape (batch, N_max, F) + :param I: Tensor, graph IDs of shape `(n_nodes, )`; + :return batch: Tensor, batched node features of shape (batch, N_max, n_node_features) """ I = tf.cast(I, tf.int32) num_nodes = tf.math.segment_sum(tf.ones_like(I), I) @@ -58,8 +58,8 @@ def disjoint_adjacency_to_batch(A, I): """ Converts a disjoint adjacency matrix to batch node by zero-padding. - :param A: Tensor, binary adjacency matrix of shape `(N, N)`; - :param I: Tensor, graph IDs of shape `(N, )`; + :param A: Tensor, binary adjacency matrix of shape `(n_nodes, n_nodes)`; + :param I: Tensor, graph IDs of shape `(n_nodes, )`; :return: Tensor, batched adjacency matrix of shape `(batch, N_max, N_max)`; """ I = tf.cast(I, tf.int64) diff --git a/spektral/layers/ops/scatter.py b/spektral/layers/ops/scatter.py index 87575f5a..b1fe509e 100644 --- a/spektral/layers/ops/scatter.py +++ b/spektral/layers/ops/scatter.py @@ -4,14 +4,14 @@ def scatter_sum(updates, indices, N): """ Sums updates along the first dimensions according to the indices, returns - a Tensor of the same rank as updates with shape `(N, ...)`. + a Tensor of the same rank as updates with shape `(n_nodes, ...)`. If the result is empty for a given index `i`, `output[i] = 0`. If a given index`i` is negative, the value is ignored. :param updates: a Tensor. :param indices: A Tensor with indices to index the updates. :param N: first dimension the output (i.e., total number of segments). :return: a Tensor with the same rank as updates, of shape - `(N, ) + updates.shape[1:]`. + `(n_nodes, ) + updates.shape[1:]`. """ return tf.math.unsorted_segment_sum(updates, indices, N) @@ -19,14 +19,14 @@ def scatter_sum(updates, indices, N): def scatter_mean(updates, indices, N): """ Averages updates along the first dimensions according to the indices, - returns a Tensor of the same rank as updates with shape `(N, ...)`. + returns a Tensor of the same rank as updates with shape `(n_nodes, ...)`. If the result is empty for a given index `i`, `output[i] = 0`. If a given index`i` is negative, the value is ignored. :param updates: a Tensor. :param indices: A Tensor with indices to index the updates. :param N: first dimension the output (i.e., total number of segments). :return: a Tensor with the same rank as updates, of shape - `(N, ) + updates.shape[1:]`. + `(n_nodes, ) + updates.shape[1:]`. 
""" return tf.math.unsorted_segment_mean(updates, indices, N) @@ -38,14 +38,14 @@ def scatter_mean(updates, indices, N): def scatter_max(updates, indices, N): """ Max-reduces updates along the first dimensions according to the indices, - returns a Tensor of the same rank as updates with shape `(N, ...)`. + returns a Tensor of the same rank as updates with shape `(n_nodes, ...)`. If the result is empty for a given index `i`, `output[i] = 0`. If a given index`i` is negative, the value is ignored. :param updates: a Tensor. :param indices: A Tensor with indices to index the updates. :param N: first dimension the output (i.e., total number of segments). :return: a Tensor with the same rank as updates, of shape - `(N, ) + updates.shape[1:]`. + `(n_nodes, ) + updates.shape[1:]`. """ return tf.math.unsorted_segment_max(updates, indices, N) @@ -53,14 +53,14 @@ def scatter_max(updates, indices, N): def scatter_min(updates, indices, N): """ Min-reduces updates along the first dimensions according to the indices, - returns a Tensor of the same rank as updates with shape `(N, ...)`. + returns a Tensor of the same rank as updates with shape `(n_nodes, ...)`. If the result is empty for a given index `i`, `output[i] = 0`. If a given index`i` is negative, the value is ignored. :param updates: a Tensor. :param indices: A Tensor with indices to index the updates. :param N: first dimension the output (i.e., total number of segments). :return: a Tensor with the same rank as updates, of shape - `(N, ) + updates.shape[1:]`. + `(n_nodes, ) + updates.shape[1:]`. """ return tf.math.unsorted_segment_min(updates, indices, N) @@ -68,14 +68,14 @@ def scatter_min(updates, indices, N): def scatter_prod(updates, indices, N): """ Multiplies updates along the first dimensions according to the indices, - returns a Tensor of the same rank as updates with shape `(N, ...)`. + returns a Tensor of the same rank as updates with shape `(n_nodes, ...)`. If the result is empty for a given index `i`, `output[i] = 0`. If a given index`i` is negative, the value is ignored. :param updates: a Tensor. :param indices: A Tensor with indices to index the updates. :param N: first dimension the output (i.e., total number of segments). :return: a Tensor with the same rank as updates, of shape - `(N, ) + updates.shape[1:]`. + `(n_nodes, ) + updates.shape[1:]`. """ return tf.math.unsorted_segment_prod(updates, indices, N) diff --git a/spektral/layers/pooling/diff_pool.py b/spektral/layers/pooling/diff_pool.py index 868ed0bd..398a9191 100644 --- a/spektral/layers/pooling/diff_pool.py +++ b/spektral/layers/pooling/diff_pool.py @@ -41,14 +41,14 @@ class DiffPool(Layer): **Input** - - Node features of shape `([batch], N, F)`; - - Binary adjacency matrix of shape `([batch], N, N)`; + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `([batch], n_nodes, n_nodes)`; **Output** - Reduced node features of shape `([batch], K, channels)`; - Reduced adjacency matrix of shape `([batch], K, K)`; - - If `return_mask=True`, the soft clustering matrix of shape `([batch], N, K)`. + - If `return_mask=True`, the soft clustering matrix of shape `([batch], n_nodes, K)`. 
**Arguments** diff --git a/spektral/layers/pooling/global_pool.py b/spektral/layers/pooling/global_pool.py index 35c05680..7050eae5 100644 --- a/spektral/layers/pooling/global_pool.py +++ b/spektral/layers/pooling/global_pool.py @@ -59,13 +59,13 @@ class GlobalSumPool(GlobalPooling): **Input** - - Node features of shape `([batch], N, F)`; - - Graph IDs of shape `(N, )` (only in disjoint mode); + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Graph IDs of shape `(n_nodes, )` (only in disjoint mode); **Output** - - Pooled node features of shape `(batch, F)` (if single mode, shape will - be `(1, F)`). + - Pooled node features of shape `(batch, n_node_features)` (if single mode, shape will + be `(1, n_node_features)`). **Arguments** @@ -88,13 +88,13 @@ class GlobalAvgPool(GlobalPooling): **Input** - - Node features of shape `([batch], N, F)`; - - Graph IDs of shape `(N, )` (only in disjoint mode); + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Graph IDs of shape `(n_nodes, )` (only in disjoint mode); **Output** - - Pooled node features of shape `(batch, F)` (if single mode, shape will - be `(1, F)`). + - Pooled node features of shape `(batch, n_node_features)` (if single mode, shape will + be `(1, n_node_features)`). **Arguments** @@ -117,13 +117,13 @@ class GlobalMaxPool(GlobalPooling): **Input** - - Node features of shape `([batch], N, F)`; - - Graph IDs of shape `(N, )` (only in disjoint mode); + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Graph IDs of shape `(n_nodes, )` (only in disjoint mode); **Output** - - Pooled node features of shape `(batch, F)` (if single mode, shape will - be `(1, F)`). + - Pooled node features of shape `(batch, n_node_features)` (if single mode, shape will + be `(1, n_node_features)`). **Arguments** @@ -152,8 +152,8 @@ class GlobalAttentionPool(GlobalPooling): **Input** - - Node features of shape `([batch], N, F)`; - - Graph IDs of shape `(N, )` (only in disjoint mode); + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Graph IDs of shape `(n_nodes, )` (only in disjoint mode); **Output** @@ -266,13 +266,13 @@ class GlobalAttnSumPool(GlobalPooling): **Input** - - Node features of shape `([batch], N, F)`; - - Graph IDs of shape `(N, )` (only in disjoint mode); + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Graph IDs of shape `(n_nodes, )` (only in disjoint mode); **Output** - - Pooled node features of shape `(batch, F)` (if single mode, shape will - be `(1, F)`). + - Pooled node features of shape `(batch, n_node_features)` (if single mode, shape will + be `(1, n_node_features)`). **Arguments** @@ -356,13 +356,13 @@ class SortPool(Layer): **Input** - - Node features of shape `([batch], N, F)`; - - Graph IDs of shape `(N, )` (only in disjoint mode); + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Graph IDs of shape `(n_nodes, )` (only in disjoint mode); **Output** - - Pooled node features of shape `(batch, k, F)` (if single mode, shape will - be `(1, k, F)`). + - Pooled node features of shape `(batch, k, n_node_features)` (if single mode, shape will + be `(1, k, n_node_features)`). 
**Arguments** diff --git a/spektral/layers/pooling/mincut_pool.py b/spektral/layers/pooling/mincut_pool.py index f81fbdee..1c4888a6 100644 --- a/spektral/layers/pooling/mincut_pool.py +++ b/spektral/layers/pooling/mincut_pool.py @@ -39,14 +39,14 @@ class MinCutPool(Layer): **Input** - - Node features of shape `([batch], N, F)`; - - Binary adjacency matrix of shape `([batch], N, N)`; + - Node features of shape `([batch], n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `([batch], n_nodes, n_nodes)`; **Output** - - Reduced node features of shape `([batch], K, F)`; + - Reduced node features of shape `([batch], K, n_node_features)`; - Reduced adjacency matrix of shape `([batch], K, K)`; - - If `return_mask=True`, the soft clustering matrix of shape `([batch], N, K)`. + - If `return_mask=True`, the soft clustering matrix of shape `([batch], n_nodes, K)`. **Arguments** diff --git a/spektral/layers/pooling/sag_pool.py b/spektral/layers/pooling/sag_pool.py index 5421fc5b..c3825b22 100644 --- a/spektral/layers/pooling/sag_pool.py +++ b/spektral/layers/pooling/sag_pool.py @@ -33,16 +33,16 @@ class SAGPool(TopKPool): **Input** - - Node features of shape `(N, F)`; - - Binary adjacency matrix of shape `(N, N)`; - - Graph IDs of shape `(N, )` (only in disjoint mode); + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`; + - Graph IDs of shape `(n_nodes, )` (only in disjoint mode); **Output** - - Reduced node features of shape `(ratio * N, F)`; - - Reduced adjacency matrix of shape `(ratio * N, ratio * N)`; - - Reduced graph IDs of shape `(ratio * N, )` (only in disjoint mode); - - If `return_mask=True`, the binary pooling mask of shape `(ratio * N, )`. + - Reduced node features of shape `(ratio * n_nodes, n_node_features)`; + - Reduced adjacency matrix of shape `(ratio * n_nodes, ratio * n_nodes)`; + - Reduced graph IDs of shape `(ratio * n_nodes, )` (only in disjoint mode); + - If `return_mask=True`, the binary pooling mask of shape `(ratio * n_nodes, )`. **Arguments** diff --git a/spektral/layers/pooling/topk_pool.py b/spektral/layers/pooling/topk_pool.py index d90dceba..d57f2dae 100644 --- a/spektral/layers/pooling/topk_pool.py +++ b/spektral/layers/pooling/topk_pool.py @@ -37,16 +37,16 @@ class TopKPool(Layer): **Input** - - Node features of shape `(N, F)`; - - Binary adjacency matrix of shape `(N, N)`; - - Graph IDs of shape `(N, )` (only in disjoint mode); + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`; + - Graph IDs of shape `(n_nodes, )` (only in disjoint mode); **Output** - - Reduced node features of shape `(ratio * N, F)`; - - Reduced adjacency matrix of shape `(ratio * N, ratio * N)`; - - Reduced graph IDs of shape `(ratio * N, )` (only in disjoint mode); - - If `return_mask=True`, the binary pooling mask of shape `(ratio * N, )`. + - Reduced node features of shape `(ratio * n_nodes, n_node_features)`; + - Reduced adjacency matrix of shape `(ratio * n_nodes, ratio * n_nodes)`; + - Reduced graph IDs of shape `(ratio * n_nodes, )` (only in disjoint mode); + - If `return_mask=True`, the binary pooling mask of shape `(ratio * n_nodes, )`. 
**Arguments** diff --git a/spektral/transforms/constant.py b/spektral/transforms/constant.py index 26534013..53ccd273 100644 --- a/spektral/transforms/constant.py +++ b/spektral/transforms/constant.py @@ -16,7 +16,7 @@ def __init__(self, value): self.value = value def __call__(self, graph): - value = np.zeros((graph.N, 1)) + self.value + value = np.zeros((graph.n_nodes, 1)) + self.value if graph.x is None: graph.x = value else: diff --git a/tests/test_data/test_dataset.py b/tests/test_data/test_dataset.py index 6ccfcbaf..75c9b147 100644 --- a/tests/test_data/test_dataset.py +++ b/tests/test_data/test_dataset.py @@ -21,9 +21,9 @@ def read(self): def test_dataset(): d = TestDataset() - assert d.F == f - assert d.S == s - assert d.n_out == 2 + assert d.n_node_features == f + assert d.n_edge_features == s + assert d.n_labels == 2 # signature for k in ['x', 'a', 'e', 'y']: @@ -41,15 +41,15 @@ def test_dataset(): # single assignment d[0] = g - assert d[0].N == n and all([d_.N != n for d_ in d[1:]]) + assert d[0].n_nodes == n and all([d_.n_nodes != n for d_ in d[1:]]) # Slice assignment d[1:3] = [g] * 2 - assert d[1].N == n and d[2].N == n and all([d_.N != n for d_ in d[3:]]) + assert d[1].n_nodes == n and d[2].n_nodes == n and all([d_.n_nodes != n for d_ in d[3:]]) # List assignment d[[3, 4]] = [g] * 2 - assert d[3].N == n and d[4].N == n and all([d_.N != n for d_ in d[5:]]) + assert d[3].n_nodes == n and d[4].n_nodes == n and all([d_.n_nodes != n for d_ in d[5:]]) # __len__ assert d.__len__() == n_graphs diff --git a/tests/test_data/test_graph.py b/tests/test_data/test_graph.py index ddc53284..c747b0b6 100644 --- a/tests/test_data/test_graph.py +++ b/tests/test_data/test_graph.py @@ -9,13 +9,14 @@ def _check_graph(x, a, e, y): + g = Graph() g = Graph(x=x) g = Graph(a=a) g = Graph(x=x, a=a, e=e, y=y) # numpy g_np = g.numpy() - g_gt_names = ['x', 'adj', 'edge_attr', 'y'] + g_gt_names = ['x', 'a', 'e', 'y'] g_gt = [x, a, e, y] for i in range(len(g_gt)): assert np.all(g_np[i] == g_gt[i]) diff --git a/tests/test_data/test_loaders.py b/tests/test_data/test_loaders.py index a75ea103..b6ae16cd 100644 --- a/tests/test_data/test_loaders.py +++ b/tests/test_data/test_loaders.py @@ -55,7 +55,7 @@ def read(self): def test_single(): data = TestDatasetSingle() - n = data.N + n = data.n_nodes loader = SingleLoader(data, sample_weights=np.ones(n), epochs=1) batches = [b for b in loader] assert len(batches) == 1 diff --git a/tests/test_data/test_utils.py b/tests/test_data/test_utils.py index fea2f3e7..abbe4bb4 100644 --- a/tests/test_data/test_utils.py +++ b/tests/test_data/test_utils.py @@ -48,4 +48,4 @@ def read(self): for batch in batches: a_, b_, c_ = batch for i in range(len(a_)): - assert a_[i] == b_[i] == c_[i].N + assert a_[i] == b_[i] == c_[i].n_nodes diff --git a/tests/test_transforms/test_transforms.py b/tests/test_transforms/test_transforms.py index dd2cc073..73c7b4bd 100644 --- a/tests/test_transforms/test_transforms.py +++ b/tests/test_transforms/test_transforms.py @@ -96,4 +96,4 @@ def test_one_hot(): g = Graph(x=x, a=a, e=e, y=y_nl) t(g) g = Graph(x=x, a=a, e=e, y=y_sc) - t(g) \ No newline at end of file + t(g) From dcc7c38ca5e43d40a23a3435c9637204ce1ac59a Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Mon, 23 Nov 2020 18:05:11 +0100 Subject: [PATCH 36/57] Update "Getting started" example Rename Loader.tf() to Loader.load() Make most loaders work even for TF < 2.4 --- README.md | 8 +- docs/templates/getting-started.md | 254 ++++++++++++------ examples/graph_prediction/qm9_batch.py | 4 +- 
examples/node_prediction/citation_arma.py | 6 +- examples/node_prediction/citation_cheby.py | 6 +- examples/node_prediction/citation_gat.py | 6 +- examples/node_prediction/citation_gcn.py | 6 +- .../node_prediction/citation_simple_gc.py | 6 +- spektral/data/dataset.py | 6 +- spektral/data/loaders.py | 32 +-- 10 files changed, 212 insertions(+), 122 deletions(-) diff --git a/README.md b/README.md index ac5381eb..03a22cf5 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,13 @@ See how to [get started with Spektral](https://graphneural.network/getting-start The source code of the project is available on [Github](https://github.com/danielegrattarola/spektral). Read the documentation [here](https://graphneural.network). -You can also cite the paper introducing Spektral: [Graph Neural Networks in TensorFlow and Keras with Spektral](https://arxiv.org/abs/2006.12138) (ICML 2020 - GRL+ Workshop). + +If you want to cite Spektral in your work, refer to our paper: + +> Graph Neural Networks in TensorFlow and Keras with Spektral +> D. Grattarola and C. Alippi +> ICML 2020 - GRL+ Workshop +> [https://arxiv.org/abs/2006.12138](https://arxiv.org/abs/2006.12138) ## Installation Spektral is compatible with Python 3.5+, and is tested on Ubuntu 16.04+ and MacOS. diff --git a/docs/templates/getting-started.md b/docs/templates/getting-started.md index 4822bd25..d328beed 100644 --- a/docs/templates/getting-started.md +++ b/docs/templates/getting-started.md @@ -1,168 +1,248 @@ # Getting started -Spektral is designed according to the guiding principles of the Keras API to make things extremely simple for beginners while maintaining flexibility for experts and researchers. +Spektral is designed according to the guiding principles of the Keras API to make things extremely simple for beginners while maintaining flexibility for experts. -The most important modules of Spektral are `layers.convolutional` and `layers.pooling`, which offer a number of popular layers to start building graph neural networks (GNNs) right away. -Because Spektral is designed as an extension of Keras, you can plug any Spektral layer into an existing Keras `Model` without modifications. - -In this page we will go over the main features of Spektral while creating a GNN for graph classification. +In this page we will go over the main features of Spektral while creating a graph neural network for graph classification. ## Graphs A graph is a mathematical object that represents relations between objects. We call the objects "nodes" and the relations "edges". -Both the nodes and the edges can have vector attributes (or features). +Both the nodes and the edges can have vector **features**. -In Spektral, graphs are represented with instances of `spektral.data.Graph` which contain: +In Spektral, graphs are represented with instances of `spektral.data.Graph` which can contain: - `a`: the **adjacency matrix** - usually a `scipy.sparse` matrix of shape `(n_nodes, n_nodes)`. -- `x`: the **node attributes** - represented by a `np.array` of shape `(n_nodes, n_node_attributes)`. -- `e`: the **edge attributes** - usually represented in a sparse edge list format, with a `np.array` of shape `(n_edges, n_edge_attributes)`. +- `x`: the **node features** - represented by a `np.array` of shape `(n_nodes, n_node_features)`. +- `e`: the **edge features** - usually represented in a sparse edge list format, with a `np.array` of shape `(n_edges, n_edge_features)`. - `y`: the **labels** - can represent anything, from graph labels to node labels, or even something else. 
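To make the structure above concrete, here is a minimal sketch of building a `Graph` by hand from the attributes just listed (the numbers are arbitrary):

```python
import numpy as np
import scipy.sparse as sp

from spektral.data import Graph

# Two nodes connected by one undirected edge
a = sp.csr_matrix(np.array([[0, 1], [1, 0]]))    # adjacency matrix, shape (n_nodes, n_nodes)
x = np.array([[1., 0., 0.], [0., 1., 0.]])       # node features, shape (n_nodes, n_node_features)
y = np.array([1., 0.])                           # a graph-level label

graph = Graph(x=x, a=a, y=y)
```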
+A graph can have all of these attributes or none of them. You can even add extra attributes if you want: after all, a `Graph` is just a plain Python object. For instance, see `graph.n_nodes`, `graph.n_node_features`, etc. + +## Datasets +The `spektral.data.Dataset` container provides some useful functionality to manipulate collections of graphs. +Let's load a popular benchmark dataset for graph classification: +```python +>>> from spektral.datasets import TUDataset +>>> dataset = TUDataset('PROTEINS') +>>> dataset +TUDataset(n_graphs=1113) +``` + +We can now retrieve individual graphs: + +```python +>>> dataset[0] +Graph(n_nodes=42, n_node_features=4, n_edge_features=None, y=[1. 0.]) +``` +or shuffle the data: +```python +>>> np.random.shuffle(dataset) +``` +or slice the dataset up into sub-datsets: +```python +>>> dataset[:100] +TUDataset(n_graphs=100) +``` +Datasets also provide methods for applying **transforms** to each data: +- `apply(transform)` - modifies the dataset in-place, by applying the `transform` to each graph; +- `map(transform)` - returns a list obtained by applying the `transform` to each graph; +- `filter(function)` - removes from the dataset any graph for which `function(graph)` is `False`. This is also an in-place operation. -## Node classification on citation networks +For exampe, let's modify our dataset so that we only have graphs with less than 500 nodes: -In this example, we will build a simple [Graph Convolutional Network](https://arxiv.org/abs/1609.02907) for semi-supervised classification of nodes. +```python +>>> dataset.filter(lambda g: g.n_nodes < 500) -This is a simple but challenging task that consists of classifying text documents in a **citation network**. -In this type of graph, each node represents a document and is associated to a binary bag-of-words attribute (1 if a given word appears in the text, 0 otherwise). -If a document cites another, then there exist an undirected edge between the two corresponding nodes. -Finally, each node has a class label that we want to predict. +>>> dataset +TUDataset(n_graphs=1111) # removed 2 graphs +``` -This is a **transductive** learning setting, where we observe all of the nodes and edges at training time, but only a fraction of the labels. The goal is to learn to predict the missing labels. +Now let's apply some transforms to our graphs. For example, we can modify each graph so that the node features also contain the one-hot-encoded degree of the nodes. -The `datasets.citation` module of Spektral lets you download and load three popular citation datasets (Cora, Citeseer and Pubmed) in one line of code. For instance, loading the Cora dataset is as simple as: +First, we compute the maximum degree of the dataset, so that we know the size of the one-hot vectors: ```python -from spektral.datasets import citation -A, X, y, train_mask, val_mask, test_mask = citation.load_data('cora') +>>> max_degree = dataset.map(lambda g: g.a.sum(-1).max(), reduce=max) -N = A.shape[0] -F = X.shape[-1] -n_classes = y.shape[-1] +>>> max_degree +12 ``` -This will load the network's adjacency matrix `A` as a Scipy sparse matrix of shape `(N, N)`, the node features `X` of shape `(N, F)`, and the labels `y` of shape `(N, n_classes)`. The loader will also return some boolean masks to know which nodes belong to the training, validation and test sets (`train_mask, val_mask, test_mask`). +Try to go over the lambda function to see what it does. Also, notice that we passed another function to the `reduce` keyword. Can you guess why? 
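To make the role of `reduce` concrete, here is a small aside (editor's illustration, continuing from the PROTEINS dataset loaded above; it is not part of the patch):

```python
# Without `reduce`, map() returns one value per graph (a plain Python list).
degrees = dataset.map(lambda g: g.a.sum(-1).max())

# Passing a function to `reduce` collapses that list into a single value,
# here the maximum over all graphs -- the number we need in order to size
# the one-hot degree vectors.
max_degree = dataset.map(lambda g: g.a.sum(-1).max(), reduce=max)
```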
+Now we are ready to augment our node features with the one-hot-encoded degree. Spektral has a lot of pre-implemented `transforms` that we can use: -## Creating a GNN +```python +>>> from spektral.transforms import Degree + +>>> dataset.apply(Degree(max_degree)) +``` -To create a GCN, we will use the `GraphConv` layer and the functional API of Keras: +We can see that it worked because now we have and extra `max_degree + 1` node features, which are our one-hot vectors: ```python -from spektral.layers import GraphConv -from tensorflow.keras.models import Model -from tensorflow.keras.layers import Input, Dropout +>>> dataset[0] +Graph(n_nodes=42, n_node_features=17, n_edge_features=None, y=[1. 0.]) ``` -Building the model is no different than building any Keras model, but we will need to provide multiple inputs (`X` and `A`) to the `GraphConv` layers: +Since we will be using a `GraphConv` layer in our GNN, we also want to follow the [original paper](https://arxiv.org/abs/1609.02907) that introduced this layer, and do some extra pre-processing. + +Specifically, we need to normalize the adjacency matrix of each graph by the node degrees. Since this is a fairly common operation, Spektral has a transform to do it: ```python -X_in = Input(shape=(F, )) -A_in = Input((N, ), sparse=True) +>>> from spektral.transforms import GCNFilter + +>>> dataset.apply(GCNFilter()) +``` + +Many layers will require you to do some form of preprocessing. If you don't want to go back to the literature every time, you can use the handy [`LayerPreprocess` transform](/transforms/#layerpreprocess). + + +## Creating a GNN + +Creating GNNs is where Spektral really shines. Since Spektral is designed as an extension of Keras, you can plug any Spektral layer into a Keras `Model` without modifications. +We just need to use the functional API because GNN layers usually need two or more inputs (so no `Sequential` models for now). -X_1 = GraphConv(16, 'relu')([X_in, A_in]) -X_1 = Dropout(0.5)(X_1) -X_2 = GraphConv(n_classes, 'softmax')([X_1, A_in]) +For our first GNN, we will create a simple network that first does a bit of graph convolution, then sums all the nodes together (known as "global pooling"), and finally classifies the result with a dense softmax layer. +Oh, and we will also use dropout for regularization. -model = Model(inputs=[X_in, A_in], outputs=X_2) +Let's start by importing the necessary layers: + +```python +from tensorflow.keras.models import Model +from tensorflow.keras.layers import Dense, Dropout +from spektral.layers import GraphConv, GlobalSumPool ``` -And that's it. We just built our first GNN in Spektral and Keras. +Now we can use model subclassing to define our model: -Note how we used the familiar API of Keras to create the GCN layers, as well as the standard `Dropout` layer to regularize our model. All features of Keras are also supported by Spektral (including initializers, regularizers, etc.). +```python +class MyFirstGNN(Model): + + def __init__(self, n_hidden, n_labels): + super().__init__() + self.graph_conv = GraphConv(n_hidden) + self.pool = GlobalSumPool() + self.dropout = Dropout(0.5) + self.dense = Dense(n_labels, 'softmax') + + def call(self, inputs): + out = self.graph_conv(inputs) + out = self.dropout(out) + out = self.pool(out) + out = self.dense(out) + + return out +``` -An important thing to notice at this point is how we defined the `Input` layers of our model. 
-Because the "elements" of our dataset are the node themselves, we are telling Keras to consider each node as a separate sample so that the `batch` axis is implicitly defined as `None`. -In other words, a sample of the node attributes will be a vector of shape `(F, )` and a sample of the adjacency matrix will be one row of shape `(N, )`. +And that's it. -Keep this detail in mind for later. +Note how we mixed layers from Spektral and Keras interchangeably: it's all just computation with tensors underneath! -## Training the GNN +This also means that if you want to break free from `Graph` and `Dataset` and every other feature of Spektral, you can. + +**Note:** If you don't want to subclass `Model` to implement your GNN, you can also use the classical declarative style. You just need to pay attention to the `Input` and leave "node" dimensions unspecified (so `None` instead of `n_nodes`). -When training GCN, we have to pre-process the adjacency matrix to 1) add self-loops and 2) scale the weights of a node's connections according to its degree. -Some layers in Spektral require a different type of pre-processing in order to work correctly, and some work out-of-the-box on the binary `A`. -The pre-processing required by each layer is available as a static class method `preprocess()`. +## Training the GNN -In our example, the pre-processing required by GCN is: +Now we're ready to train the GNN. First, we instantiate and compile our model: ```python -A = GraphConv.preprocess(A).astype('f4') +model = MyFirstGNN(32, dataset.n_labels) +model.compile('adam', 'categorical_crossentropy') ``` -And that's all! -What's left now for us is to compile and train our model: +and we're almost there! + +However, here's where graphs get in our way. Unlike regular data, like images or sequences, graphs cannot be stretched or cut or reshaped so that we can fit them into tensors of pre-defined shape. If a graph has 10 nodes and another one has 4, we have to keep them that way. + +This means that iterating over a dataset in mini-batches is not trivial and we cannot simply use the `model.fit()` method of Keras as-is. + +We have to use a data `Loader`. + +### Loaders + +Loaders iterate over a graph dataset to create mini-batches. They hide a lot of the complexity behind the process, so that you don't need to think about it. +You only need to go to [this page](/data-modes) and read up on **data modes**, so that you know which loader to use. + +Each loader has a `load()` method that when called will return a data generator that Keras can process. + +Since we're doing graph-level classification, we can use a `BatchLoader`. It's a bit slow and memory intensive (a `DisjointLoader` would have been better), but it lets us simplify the definition of `MyFirstGNN`. Again, go read about [data modes](/data-modes) after this tutorial. + +Let's create a data loader: ```python -model.compile(optimizer='adam', - loss='categorical_crossentropy', - weighted_metrics=['acc']) -model.summary() +from spektral.data import BatchLoader + +loader = BatchLoader(dataset_train, batch_size=32) ``` -Note that we used the `weighted_metrics` argument instead of the usual `metrics`. This is due to the particular semi-supervised problem that we are dealing with, and has to do with the boolean masks that we loaded earlier (more on that later). +and we can finally train our GNN! 
-We can now train the model using the native `fit()` method of Keras: +Since loaders are essentially generators, we need to provide the `steps_per_epoch` keyword to `model.fit()` and we don't need to specify a batch size: ```python -# Prepare data -X = X.toarray() -A = A.astype('f4') -validation_data = ([X, A], y, val_mask) - -# Train model -model.fit([X, A], y, - sample_weight=train_mask, - validation_data=validation_data, - batch_size=N, - shuffle=False) +model.fit(loader.load(), + steps_per_epoch=loader.steps_per_epoch, + epochs=10) ``` -There are a couple of things to note here. +Done! -We have set `batch_size=N` and `shuffle=False`. This is because the default behaviour of Keras is to split the data into batches of 32 and shuffle the samples at each epoch. -However, shuffling the adjacency matrix along one axis and not the other means that row `i` will represent a different node than column `i`. -At the same time, if we split the graph into batches we may end up in a situation where we need to use a node attribute that is not part of the batch. The only solution is to take all the node features at the same time, hence `batch_size=N`. +## Evaluating the GNN -Finally, we used `train_mask` and `val_mask` as `sample_weight`. -This means that, during training, the training nodes will have a weight of 1 and the validation nodes will have a weight of 0. Then, in validation, we will set the training nodes to have a weight of 0 and the validation nodes to have a weight of 1. +Evaluating the performance of our model, be it for testing or validation, follows a similar workflow. -This is all that we need to do to differentiate between training and test data. See how the model takes as input the full `X`, `A`, and `y` for both training and valdation? The only thing that changes is the mask. This is also why we used the `weighted_metrics` keyword when compiling the model, so that our accuracy is calculated only on the correct nodes at each phase. +We create a data loader: -## Evaluating the model +```python +from spektral.data import BatchLoader + +loader = BatchLoader(dataset_test, batch_size=32) +``` -Once again, evaluation is done in vanilla Keras. We just have to keep in mind the same considerations about batching that we did for training (note that in `model.evaluate()` by default `shuffle=False`): +and feed it to the model by calling `load()`: ```python -# Evaluate model -eval_results = model.evaluate([X, A], - y, - sample_weight=test_mask, - batch_size=N) -print('Done.\n' - 'Test loss: {}\n' - 'Test accuracy: {}'.format(*eval_results)) +loss = model.evaluate(loader.load(), + steps=loader.steps_per_epoch) +print('Test loss: {}'.format(loss)) ``` +## Node-level learning + +Besides learning to predict labels for the whole graph, like in this tutorial, GNNs are very effective at learning to predict labels for each individual node. This is called "node-level learning" and we usually do it for datasets with one big graph (think a social network). + +For example, reproducing the results of the [GCN paper for classifying nodes in a citation network](https://arxiv.org/abs/1609.02907) can be done with `GraphConv` layers, the `Citation` dataset, and a `SingleLoader`: check out [this example](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gcn.py). + +As a matter of fact, check out [all the examples](/examples). + ## Go create! -You are now ready to use Spektral to create your own models. +You are now ready to use Spektral to create your own GNNs. 
+ +If you want to build a GNN for a specific task, chances are that everything you need is already in Spektral. Check out the [examples](https://github.com/danielegrattarola/spektral/tree/master/examples) for some ideas and practical tips. + +Remember to read the [data modes section](/data-modes) to learn about representing graphs and creating mini-batches. -If you want to build a GNN for a specific task, chances are that everything you need is already part of Spektral. Check the [examples](https://github.com/danielegrattarola/spektral/tree/master/examples) for some ideas and practical tips. +Make sure to read the documentation, and get in touch [on Github](https://github.com/danielegrattarola/spektral) if you have a feature that you want to see implemented. -Remember to read the [data representation section](https://danielegrattarola.github.io/spektral/data/) to learn different ways of representing a graph or batches of different graphs. +If you want to cite Spektral in your work, refer to our paper: -Make sure to check the documentation, and get in touch [on Github](https://github.com/danielegrattarola/spektral) if you have a feature that you want to see implemented. +> Graph Neural Networks in TensorFlow and Keras with Spektral +> D. Grattarola and C. Alippi +> ICML 2020 - GRL+ Workshop +> [https://arxiv.org/abs/2006.12138](https://arxiv.org/abs/2006.12138) diff --git a/examples/graph_prediction/qm9_batch.py b/examples/graph_prediction/qm9_batch.py index 81540275..1618baac 100644 --- a/examples/graph_prediction/qm9_batch.py +++ b/examples/graph_prediction/qm9_batch.py @@ -57,7 +57,7 @@ # FIT MODEL ################################################################################ loader_tr = BatchLoader(dataset_tr, batch_size=batch_size) -model.fit(loader_tr, +model.fit(loader_tr.load(), steps_per_epoch=loader_tr.steps_per_epoch, epochs=epochs) @@ -66,5 +66,5 @@ ################################################################################ print('Testing model') loader_te = BatchLoader(dataset_te, batch_size=batch_size) -model_loss = model.evaluate(loader_te, steps=loader_tr.steps_per_epoch) +model_loss = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch) print('Done. 
Test loss: {}'.format(model_loss)) diff --git a/examples/node_prediction/citation_arma.py b/examples/node_prediction/citation_arma.py index 0d87144d..054ddc05 100644 --- a/examples/node_prediction/citation_arma.py +++ b/examples/node_prediction/citation_arma.py @@ -71,9 +71,9 @@ # Train model loader_tr = SingleLoader(dataset, sample_weights=mask_tr) loader_va = SingleLoader(dataset, sample_weights=mask_va) -model.fit(loader_tr.tf(), +model.fit(loader_tr.load(), steps_per_epoch=loader_tr.steps_per_epoch, - validation_data=loader_va.tf(), + validation_data=loader_va.load(), validation_steps=loader_va.steps_per_epoch, epochs=epochs, callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) @@ -81,7 +81,7 @@ # Evaluate model print('Evaluating model.') loader_te = SingleLoader(dataset, sample_weights=mask_te) -eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) +eval_results = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/examples/node_prediction/citation_cheby.py b/examples/node_prediction/citation_cheby.py index 0482c121..5954fcf1 100644 --- a/examples/node_prediction/citation_cheby.py +++ b/examples/node_prediction/citation_cheby.py @@ -63,9 +63,9 @@ # Train model loader_tr = SingleLoader(dataset, sample_weights=mask_tr) loader_va = SingleLoader(dataset, sample_weights=mask_va) -model.fit(loader_tr.tf(), +model.fit(loader_tr.load(), steps_per_epoch=loader_tr.steps_per_epoch, - validation_data=loader_va.tf(), + validation_data=loader_va.load(), validation_steps=loader_va.steps_per_epoch, epochs=epochs, callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) @@ -73,7 +73,7 @@ # Evaluate model print('Evaluating model.') loader_te = SingleLoader(dataset, sample_weights=mask_te) -eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) +eval_results = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/examples/node_prediction/citation_gat.py b/examples/node_prediction/citation_gat.py index 3aab06fc..8ff32203 100644 --- a/examples/node_prediction/citation_gat.py +++ b/examples/node_prediction/citation_gat.py @@ -69,9 +69,9 @@ # Train model loader_tr = SingleLoader(dataset, sample_weights=mask_tr) loader_va = SingleLoader(dataset, sample_weights=mask_va) -model.fit(loader_tr.tf(), +model.fit(loader_tr.load(), steps_per_epoch=loader_tr.steps_per_epoch, - validation_data=loader_va.tf(), + validation_data=loader_va.load(), validation_steps=loader_va.steps_per_epoch, epochs=epochs, callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) @@ -79,7 +79,7 @@ # Evaluate model print('Evaluating model.') loader_te = SingleLoader(dataset, sample_weights=mask_te) -eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) +eval_results = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/examples/node_prediction/citation_gcn.py b/examples/node_prediction/citation_gcn.py index 1ef604ae..05dc06b9 100644 --- a/examples/node_prediction/citation_gcn.py +++ b/examples/node_prediction/citation_gcn.py @@ -59,9 +59,9 @@ # Train model loader_tr = SingleLoader(dataset, sample_weights=mask_tr) loader_va = SingleLoader(dataset, sample_weights=mask_va) -model.fit(loader_tr.tf(), 
+model.fit(loader_tr.load(), steps_per_epoch=loader_tr.steps_per_epoch, - validation_data=loader_va.tf(), + validation_data=loader_va.load(), validation_steps=loader_va.steps_per_epoch, epochs=epochs, callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) @@ -69,7 +69,7 @@ # Evaluate model print('Evaluating model.') loader_te = SingleLoader(dataset, sample_weights=mask_te) -eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) +eval_results = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/examples/node_prediction/citation_simple_gc.py b/examples/node_prediction/citation_simple_gc.py index f9770c1b..c815bee2 100644 --- a/examples/node_prediction/citation_simple_gc.py +++ b/examples/node_prediction/citation_simple_gc.py @@ -71,9 +71,9 @@ def __call__(self, graph): # Train model loader_tr = SingleLoader(dataset, sample_weights=mask_tr) loader_va = SingleLoader(dataset, sample_weights=mask_va) -model.fit(loader_tr.tf(), +model.fit(loader_tr.load(), steps_per_epoch=loader_tr.steps_per_epoch, - validation_data=loader_va.tf(), + validation_data=loader_va.load(), validation_steps=loader_va.steps_per_epoch, epochs=epochs, callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)]) @@ -81,7 +81,7 @@ def __call__(self, graph): # Evaluate model print('Evaluating model.') loader_te = SingleLoader(dataset, sample_weights=mask_te) -eval_results = model.evaluate(loader_te.tf(), steps=loader_te.steps_per_epoch) +eval_results = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch) print('Done.\n' 'Test loss: {}\n' 'Test accuracy: {}'.format(*eval_results)) diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 3e8c7cdf..127f7984 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -179,12 +179,16 @@ def __len__(self): return len(self.graphs) def __repr__(self): - return '{}({})'.format(self.__class__.__name__, self.__len__()) + return '{}(n_graphs={})'.format(self.__class__.__name__, self.n_graphs) @property def path(self): return osp.join(DATASET_FOLDER, self.__class__.__name__) + @property + def n_graphs(self): + return self.__len__() + @property def n_nodes(self): if len(self.graphs) == 1 or len(set([g.n_nodes for g in self.graphs])) == 1: diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index bbf2b3d8..6aca53a7 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -46,19 +46,16 @@ def collate(self, batch): Additionally, a Loader should implement two main methods that simplify its usage within the TensorFlow/Keras training pipeline: - - `tf()`: should return a `tf.data` dataset, a generator, or a + - `load()`: should return a `tf.data` dataset, a generator, or a `keras.utils.Sequence`. Its usage pattern should be as follows: - `model.fit(loader.tf(), steps_per_epoch=loader.steps_per_epoch)` + `model.fit(loader.load(), steps_per_epoch=loader.steps_per_epoch)` The `steps_per_epoch` property returns the number of batches (as specified by the `batch_size` argument) that are in an epoch and is automatically computed from the data. - Note that TensorFlow 2.4 or above is required to use this method in a - Keras training loop. - - By default, `tf()` will simply return a `tf.data.Dataset.from_generator` + By default, `load()` will simply return a `tf.data.Dataset.from_generator` dataset obtained from the Loader itself (since Loaders are also Python generators). 
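As an editorial aside, the usage pattern described in this docstring boils down to two equivalent ways of consuming a loader. The sketch below assumes a `dataset` and a compiled `model` are already available; it is not part of the patch:

```python
from spektral.data import DisjointLoader

# Keras-style: load() yields batches and steps_per_epoch tells Keras where
# one pass over the data ends (TF >= 2.4 is needed for DisjointLoader.load()).
loader = DisjointLoader(dataset, batch_size=32)
model.fit(loader.load(),
          steps_per_epoch=loader.steps_per_epoch,
          epochs=10)

# Loaders are also plain Python generators, so a custom training loop can
# iterate over them directly (epochs=1 stops after a single pass).
loader = DisjointLoader(dataset, batch_size=32, epochs=1)
for inputs, target in loader:
    ...  # e.g. a custom train step on (inputs, target)
```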
@@ -101,12 +98,8 @@ def generator(self): def collate(self, batch): raise NotImplementedError - def tf(self): - if not tf_loader_available: - raise RuntimeError('Calling Loader.tf() requires TensorFlow 2.4 ' - 'or greater.') - return tf.data.Dataset.from_generator( - lambda: self, output_signature=self.tf_signature()) + def load(self): + return self def tf_signature(self): signature = self.dataset.signature @@ -132,9 +125,6 @@ class SingleLoader(Loader): argument. If given, then each batch will be a tuple `(inputs, labels, sample_weights)`. - The `tf()` method of this loader can be used even with TensorFlow versions - below 2.4. - **Arguments** - `dataset`: a graph Dataset; @@ -181,7 +171,7 @@ def collate(self, batch): output += [self.sample_weights] return tuple(output) - def tf(self): + def load(self): output = self.collate(self.dataset) return tf.data.Dataset.from_tensors(output).repeat(self.epochs) @@ -208,6 +198,9 @@ class DisjointLoader(Loader): If `node_level=True`, then the labels are stacked vertically (i.e., `(n_nodes, n_labels)`). + Note that TensorFlow 2.4 or above is required to use this Loader's `load()` + method in a Keras training loop. + **Arguments** - `dataset`: a graph Dataset; @@ -256,6 +249,13 @@ def collate(self, batch): return output, y + def load(self): + if not tf_loader_available: + raise RuntimeError('Calling DisjointLoader.load() requires ' + 'TensorFlow 2.4 or greater.') + return tf.data.Dataset.from_generator( + lambda: self, output_signature=self.tf_signature()) + def tf_signature(self): signature = self.dataset.signature if 'y' in signature: From 7b63a13797cbaff00d003d5eb7df67299fd02fdf Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Tue, 24 Nov 2020 12:30:28 +0100 Subject: [PATCH 37/57] Fix example --- examples/graph_prediction/custom_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/graph_prediction/custom_dataset.py b/examples/graph_prediction/custom_dataset.py index db3032b3..b6e7e9ca 100644 --- a/examples/graph_prediction/custom_dataset.py +++ b/examples/graph_prediction/custom_dataset.py @@ -50,8 +50,8 @@ class MyDataset(Dataset): The graphs have `n_colors` colors, of at least `n_min` and at most `n_max` nodes connected with probability `p`. 
""" - def __init__(self, n_graphs, n_colors=3, n_min=10, n_max=100, p=0.5, **kwargs): - self.n_graphs = n_graphs + def __init__(self, n_samples, n_colors=3, n_min=10, n_max=100, p=0.5, **kwargs): + self.n_samples = n_samples self.n_colors = n_colors self.n_min = n_min self.n_max = n_max @@ -78,7 +78,7 @@ def make_graph(): return Graph(x=x, a=a, y=y) # We must return a list of Graph objects - return [make_graph() for _ in range(self.n_graphs)] + return [make_graph() for _ in range(self.n_samples)] dataset = MyDataset(1000, transforms=NormalizeAdj()) From cb68a740064544c7a1bd7ad8db04ebc8202a9fdb Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Tue, 24 Nov 2020 18:05:49 +0100 Subject: [PATCH 38/57] Add MNIST Dataset Update MNIST example Improve BatchLoaders so that they are compatible with Keras if there is only one Tensor in inputs Arguments for to_batch and to_disjoint utils are now optional to support partially defined Graphs --- .../graph_signal_classification_mnist.py | 126 ++++++++---------- spektral/data/loaders.py | 7 +- spektral/data/utils.py | 91 +++++++------ spektral/datasets/__init__.py | 2 +- spektral/datasets/citation.py | 3 +- spektral/datasets/graphsage.py | 2 +- spektral/datasets/mnist.py | 91 ++++++------- tests/test_data/test_dataset.py | 14 +- tests/test_data/test_utils.py | 7 +- tests/test_datasets.py | 6 +- 10 files changed, 170 insertions(+), 179 deletions(-) diff --git a/examples/other/graph_signal_classification_mnist.py b/examples/other/graph_signal_classification_mnist.py index a109c1fc..470f6661 100644 --- a/examples/other/graph_signal_classification_mnist.py +++ b/examples/other/graph_signal_classification_mnist.py @@ -1,34 +1,32 @@ import numpy as np -import tensorflow as tf from tensorflow.keras import Model from tensorflow.keras.layers import Dense, Flatten -from tensorflow.keras.losses import SparseCategoricalCrossentropy -from tensorflow.keras.metrics import SparseCategoricalAccuracy -from tensorflow.keras.optimizers import Adam from tensorflow.keras.regularizers import l2 -from spektral.datasets import mnist +from spektral.data import PackedBatchLoader +from spektral.datasets.mnist import MNIST from spektral.layers import GraphConv from spektral.layers.ops import sp_matrix_to_sp_tensor -from spektral.data.utils import batch_generator # Parameters -learning_rate = 1e-3 # Learning rate for Adam -batch_size = 32 # Batch size -epochs = 1000 # Number of training epochs -patience = 10 # Patience for early stopping -l2_reg = 5e-4 # Regularization rate for l2 +batch_size = 32 # Batch size +epochs = 1000 # Number of training epochs +patience = 10 # Patience for early stopping +l2_reg = 5e-4 # Regularization rate for l2 # Load data -x_tr, y_tr, x_va, y_va, x_te, y_te, A = mnist.load_data() -x_tr, x_va, x_te = x_tr[..., None], x_va[..., None], x_te[..., None] -N = x_tr.shape[-2] # Number of nodes in the graphs -F = x_tr.shape[-1] # Node features dimensionality -n_out = 10 # Dimension of the target +data = MNIST() -# Create filter for GCN and convert to sparse tensor -fltr = GraphConv.preprocess(A) -fltr = sp_matrix_to_sp_tensor(fltr) +# The adjacency matrix is stored as an attribute of the dataset. +# Create filter for GCN and convert to sparse tensor. 
+adj = data.a +adj = GraphConv.preprocess(adj) +adj = sp_matrix_to_sp_tensor(adj) + +# Train/valid/test split +data_tr, data_te = data[:-10000], data[-10000:] +np.random.shuffle(data_tr) +data_tr, data_va = data_tr[:-10000], data_tr[-10000:] # Build model @@ -39,78 +37,64 @@ def __init__(self, **kwargs): self.conv2 = GraphConv(32, activation='elu', kernel_regularizer=l2(l2_reg)) self.flatten = Flatten() self.fc1 = Dense(512, activation='relu') - self.fc2 = Dense(n_out, activation='softmax') + self.fc2 = Dense(10, activation='softmax') # MNIST has 10 classes def call(self, inputs): - x, fltr = inputs - x = self.conv1([x, fltr]) - x = self.conv2([x, fltr]) + x, a = inputs + x = self.conv1([x, a]) + x = self.conv2([x, a]) output = self.flatten(x) output = self.fc1(output) output = self.fc2(output) return output - +# Create model model = Net() -optimizer = Adam(lr=learning_rate) -loss_fn = SparseCategoricalCrossentropy() -acc_fn = SparseCategoricalAccuracy() - +model.compile('adam', 'sparse_categorical_crossentropy', + metrics=['sparse_categorical_accuracy']) -# Training step -@tf.function -def train(x, y): - with tf.GradientTape() as tape: - predictions = model([x, fltr], training=True) - loss = loss_fn(y, predictions) - loss += sum(model.losses) - acc = acc_fn(y, predictions) - gradients = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(gradients, model.trainable_variables)) - return loss, acc - -# Evaluation step -@tf.function -def evaluate(x, y): - predictions = model([x, fltr], training=False) - loss = loss_fn(y, predictions) - loss += sum(model.losses) - acc = acc_fn(y, predictions) - - return loss, acc +# Evaluation function +def evaluate(loader): + step = 0 + results = [] + for batch in loader: + step += 1 + x, y = batch + l, a = model.test_on_batch([x, adj], y) + results.append((l, a)) + if step == loader.steps_per_epoch: + return np.mean(results, 0) # Setup training best_val_loss = 99999 current_patience = patience -curent_batch = 0 -batches_in_epoch = int(np.ceil(x_tr.shape[0] / batch_size)) -batches_tr = batch_generator([x_tr, y_tr], batch_size=batch_size, epochs=epochs) +step = 0 + +# We can use PackedBatchLoader because we only need to create batches of node +# features with the same dimensions. 
+loader_tr = PackedBatchLoader(data_tr, batch_size=batch_size, epochs=epochs) +loader_va = PackedBatchLoader(data_va, batch_size=batch_size) +loader_te = PackedBatchLoader(data_te, batch_size=batch_size) # Training loop results_tr = [] -results_te = np.zeros(2) -for batch in batches_tr: - curent_batch += 1 +for batch in loader_tr: + step += 1 # Training step - l, a = train(*batch) + x, y = batch + l, a = model.train_on_batch([x, adj], y) results_tr.append((l, a)) - if curent_batch == batches_in_epoch: - batches_va = batch_generator([x_va, y_va], batch_size=batch_size, epochs=1) - results_va = [evaluate(*batch) for batch in batches_va] - results_va = np.array(results_va) - loss_va, acc_va = results_va.mean(0) - if loss_va < best_val_loss: - best_val_loss = loss_va + if step == loader_tr.steps_per_epoch: + results_va = evaluate(loader_va) + if results_va[0] < best_val_loss: + best_val_loss = results_va[0] current_patience = patience - # Test - batches_te = batch_generator([x_te, y_te], batch_size=batch_size, epochs=1) - results_te = [evaluate(*batch) for batch in batches_te] - results_te = np.array(results_te) + results_te = evaluate(loader_te) else: current_patience -= 1 if current_patience == 0: @@ -118,14 +102,12 @@ def evaluate(x, y): break # Print results - results_tr = np.array(results_tr) + results_tr = np.mean(results_tr, 0) print('Train loss: {:.4f}, acc: {:.4f} | ' 'Valid loss: {:.4f}, acc: {:.4f} | ' 'Test loss: {:.4f}, acc: {:.4f}' - .format(*results_tr.mean(0), - *results_va.mean(0), - *results_te.mean(0))) + .format(*results_tr, *results_va, *results_te)) # Reset epoch results_tr = [] - curent_batch = 0 + step = 0 diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index 6aca53a7..ca587fa4 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -325,6 +325,8 @@ def collate(self, batch): packed = self._pack(batch) y = np.array(packed[-1]) output = to_batch(*packed[:-1]) + if len(output) == 1: + output = output[0] return output, y @@ -368,4 +370,7 @@ def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): self._generator = self.generator() def collate(self, batch): - return batch[:-1], batch[-1] \ No newline at end of file + if len(batch) == 2: + return batch[0], batch[1] + else: + return batch[:-1], batch[-1] \ No newline at end of file diff --git a/spektral/data/utils.py b/spektral/data/utils.py index f66569b1..111c9eb5 100644 --- a/spektral/data/utils.py +++ b/spektral/data/utils.py @@ -5,71 +5,74 @@ from spektral.utils import pad_jagged_array -def _check_input(x_list, a_list, e_list=None): - if not len(x_list) == len(a_list): - raise ValueError('x_list and a_list must have the same length') - if e_list is not None and len(e_list) != len(x_list): - raise ValueError('x_list, a_list, and e_list must have the same length') - if len(x_list) < 1: - raise ValueError('Need at least one graph') - - -def to_disjoint(x_list, a_list, e_list=None): +def to_disjoint(x_list=None, a_list=None, e_list=None): """ - Converts lists of node features, adjacency matrices and (optionally) edge - features to [disjoint mode](https://danielegrattarola.github.io/spektral/data/#disjoint-mode). + Converts lists of node features, adjacency matrices and edge features to + [disjoint mode](https://danielegrattarola.github.io/spektral/data/#disjoint-mode). + + Either the node features or the adjacency matrices must be provided as input. The i-th element of each list must be associated with the i-th graph. 
The method also computes the batch index to retrieve individual graphs from the disjoint union. - The edge attributes of a graph can be represented as + Edge attributes can be represented as: - a dense array of shape `(n_nodes, n_nodes, n_edge_features)`; - a sparse edge list of shape `(n_edges, n_edge_features)`; and they will always be returned as edge list for efficiency. - :param x_list: a list of np.arrays of shape `(n_nodes, n_node_features)` -- note that `n_nodes` can - change between graphs; + :param x_list: a list of np.arrays of shape `(n_nodes, n_node_features)` + -- note that `n_nodes` can change between graphs; :param a_list: a list of np.arrays or scipy.sparse matrices of shape `(n_nodes, n_nodes)`; - :param e_list: a list of np.arrays of shape `(n_nodes, n_nodes, n_edge_features)` or `(n_edges, n_edge_features)`; - :return: + :param e_list: a list of np.arrays of shape + `(n_nodes, n_nodes, n_edge_features)` or `(n_edges, n_edge_features)`; + :return: only if the corresponding list is given as input: + - `x`: np.array of shape `(n_nodes, n_node_features)`; - `a`: scipy.sparse matrix of shape `(n_nodes, n_nodes)`; - `e`: (optional) np.array of shape `(n_edges, n_edge_features)`; - `i`: np.array of shape `(n_nodes, )`; """ - _check_input(x_list, a_list, e_list) + if a_list is None and x_list is None: + raise ValueError('Need at least x_list or a_list.') # Node features - x_out = np.vstack(x_list) + x_out = None + if x_list is not None: + x_out = np.vstack(x_list) # Adjacency matrix - a_out = sp.block_diag(a_list) + a_out = None + if a_list is not None: + a_out = sp.block_diag(a_list) # Batch index - n_nodes = np.array([x.shape[0] for x in x_list]) + n_nodes = np.array( + [x.shape[0] for x in (x_list if x_list is not None else a_list)]) i_out = np.repeat(np.arange(len(n_nodes)), n_nodes) # Edge attributes + e_out = None if e_list is not None: if e_list[0].ndim == 3: # Convert dense to sparse e_list = [e[sp.find(a)[:-1]] for e, a in zip(e_list, a_list)] e_out = np.vstack(e_list) - return x_out, a_out, e_out, i_out - else: - return x_out, a_out, i_out + + return tuple(out for out in [x_out, a_out, e_out, i_out] if out is not None) -def to_batch(x_list, a_list, e_list=None): +def to_batch(x_list=None, a_list=None, e_list=None): """ Converts lists of node features, adjacency matrices and (optionally) edge features to [batch mode](https://danielegrattarola.github.io/spektral/data/#batch-mode), by zero-padding all tensors to have the same node dimension `n_max`. + Either the node features or the adjacency matrices must be provided as input. + The i-th element of each list must be associated with the i-th graph. If `a_list` contains sparse matrices, they will be converted to dense @@ -82,29 +85,37 @@ def to_batch(x_list, a_list, e_list=None): and they will always be returned as dense arrays. 
- :param x_list: a list of np.arrays of shape `(n_nodes, n_node_features)` -- note that `n_nodes` can - change between graphs; + :param x_list: a list of np.arrays of shape `(n_nodes, n_node_features)` + -- note that `n_nodes` can change between graphs; :param a_list: a list of np.arrays or scipy.sparse matrices of shape `(n_nodes, n_nodes)`; - :param e_list: a list of np.arrays of shape `(n_nodes, n_nodes, n_edge_features)`; - :return: + :param e_list: a list of np.arrays of shape + `(n_nodes, n_nodes, n_edge_features)` or `(n_edges, n_edge_features)`; + :return: only if the corresponding list is given as input: + - `x`: np.array of shape `(batch, n_max, n_node_features)`; - `a`: np.array of shape `(batch, n_max, n_max)`; - - `e`: (only if `e_list` is given) np.array of shape - `(batch, n_max, n_max, n_edge_features)`; + - `e`: np.array of shape `(batch, n_max, n_max, n_edge_features)`; """ - _check_input(x_list, a_list, e_list) - n_max = max([a.shape[-1] for a in a_list]) + if a_list is None and x_list is None: + raise ValueError('Need at least x_list or a_list') + + n_max = max([x.shape[0] for x in (x_list if x_list is not None else a_list)]) # Node features - x_out = pad_jagged_array(x_list, (n_max, -1)) + x_out = None + if x_list is not None: + x_out = pad_jagged_array(x_list, (n_max, -1)) # Adjacency matrix - if hasattr(a_list[0], 'toarray'): # Convert sparse to dense - a_list = [a.toarray() for a in a_list] - a_out = pad_jagged_array(a_list, (n_max, n_max)) + a_out = None + if a_list is not None: + if hasattr(a_list[0], 'toarray'): # Convert sparse to dense + a_list = [a.toarray() for a in a_list] + a_out = pad_jagged_array(a_list, (n_max, n_max)) # Edge attributes + e_out = None if e_list is not None: if e_list[0].ndim == 2: # Sparse to dense for i in range(len(a_list)): @@ -113,9 +124,9 @@ def to_batch(x_list, a_list, e_list=None): e_new[np.nonzero(a)] = e e_list[i] = e_new e_out = pad_jagged_array(e_list, (n_max, n_max, -1)) - return x_out, a_out, e_out - else: - return x_out, a_out + + return tuple(out for out in [x_out, a_out, e_out] if out is not None) + def batch_generator(data, batch_size=32, epochs=None, shuffle=True): diff --git a/spektral/datasets/__init__.py b/spektral/datasets/__init__.py index a91f2f61..38387b2d 100644 --- a/spektral/datasets/__init__.py +++ b/spektral/datasets/__init__.py @@ -1,6 +1,6 @@ from .citation import Citation from .graphsage import GraphSage, PPI, Reddit -from . import mnist +from .mnist import MNIST from .ogb import OGB from .qm9 import QM9 from .tudataset import TUDataset diff --git a/spektral/datasets/citation.py b/spektral/datasets/citation.py index 706c1a79..e3a038e8 100644 --- a/spektral/datasets/citation.py +++ b/spektral/datasets/citation.py @@ -19,8 +19,9 @@ class Citation(Dataset): in the text document associated to each node. Two papers are connected if either one cites the other. Labels represent the subject area of the paper. + The train, test, and validation splits are given as binary masks and are - accessible with the `mask_tr`, `mask_va`, and `mask_te` respectively. + accessible via the `mask_tr`, `mask_va`, and `mask_te` attributes. **Arguments** diff --git a/spektral/datasets/graphsage.py b/spektral/datasets/graphsage.py index 2a111749..314d264d 100644 --- a/spektral/datasets/graphsage.py +++ b/spektral/datasets/graphsage.py @@ -31,7 +31,7 @@ class GraphSage(Dataset): the title and comments, the post's score and the number of comments. 
The train, test, and validation splits are given as binary masks and are - accessible with the `mask_tr`, `mask_va`, and `mask_te` respectively. + accessible via the `mask_tr`, `mask_va`, and `mask_te` attributes. **Arguments** diff --git a/spektral/datasets/mnist.py b/spektral/datasets/mnist.py index 9aecae55..469ee715 100644 --- a/spektral/datasets/mnist.py +++ b/spektral/datasets/mnist.py @@ -1,57 +1,60 @@ -""" -This code is largely take from M. Defferrard's Github -https://github.com/mdeff/cnn_graph/blob/master/nips2016/mnist.ipynb. -""" - import numpy as np import scipy.sparse as sp -from sklearn.model_selection import train_test_split from sklearn.neighbors import kneighbors_graph from tensorflow.keras.datasets import mnist as m +from spektral.data import Dataset, Graph + MNIST_SIZE = 28 -def load_data(k=8, noise_level=0.0): +class MNIST(Dataset): """ - Loads the MNIST dataset and a K-NN graph to perform graph signal - classification, as described by [Defferrard et al. (2016)](https://arxiv.org/abs/1606.09375). - The K-NN graph is statically determined from a regular grid of pixels using - the 2d coordinates. + The MNIST dataset used as node features for a grid graph, as described by + [Defferrard et al. (2016)](https://arxiv.org/abs/1606.09375). + + This dataset is a graph signal classification task, where graphs are + represented in mixed mode: one adjacency matrix, many instances of node + features. + + For efficiency, the adjacency matrix is stored in a special attribute of the + dataset and the Graphs only contain the node features. + You can access the adjacency matrix via the `a` attribute. The node features of each graph are the MNIST digits vectorized and rescaled to [0, 1]. - Two nodes are connected if they are neighbours according to the K-NN graph. - Labels are the MNIST class associated to each sample. + Two nodes are connected if they are neighbours on the grid. + Labels represent the MNIST class associated to each sample. - :param k: int, number of neighbours for each node; - :param noise_level: fraction of edges to flip (from 0 to 1 and vice versa); + **Note:** the last 10000 samples are the default test set of the MNIST + dataset. - :return: - - X_train, y_train: training node features and labels; - - X_val, y_val: validation node features and labels; - - X_test, y_test: test node features and labels; - - A: adjacency matrix of the grid; + **Arguments** + + - `p_flip`: if >0, then edges are randomly flipped from 0 to 1 or vice versa + with that probability. + - `k`: number of neighbours of each node. """ - A = _mnist_grid_graph(k) - A = _flip_random_edges(A, noise_level).astype(np.float32) + def __init__(self, p_flip=0., k=8, **kwargs): + self.a = None + self.k = k + self.p_flip = p_flip + super().__init__(**kwargs) - (X_train, y_train), (X_test, y_test) = m.load_data() - X_train, X_test = X_train / 255.0, X_test / 255.0 - X_train = X_train.reshape(-1, MNIST_SIZE ** 2) - X_test = X_test.reshape(-1, MNIST_SIZE ** 2) + def read(self): + self.a = _mnist_grid_graph(self.k) + self.a = _flip_random_edges(self.a, self.p_flip) - X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=10000) + (x_train, y_train), (x_test, y_test) = m.load_data() + x = np.vstack((x_train, x_test)) + x = x / 255. 
+ y = np.concatenate((y_train, y_test), 0) + x = x.reshape(-1, MNIST_SIZE ** 2, 1) - return X_train, y_train, X_val, y_val, X_test, y_test, A + return [Graph(x=x_, y=y_) for x_, y_ in zip(x, y)] def _grid_coordinates(side): - """ - Returns 2D coordinates for a square grid of equally spaced nodes. - :param side: int, the side of the grid (i.e., the grid has side * side nodes). - :return: np.array of shape (side * side, 2). - """ M = side ** 2 x = np.linspace(0, 1, side, dtype=np.float32) y = np.linspace(0, 1, side, dtype=np.float32) @@ -63,13 +66,6 @@ def _grid_coordinates(side): def _get_adj_from_data(X, k, **kwargs): - """ - Computes adjacency matrix of a K-NN graph from the given data. - :param X: rank 1 np.array, the 2D coordinates of pixels on the grid. - :param kwargs: kwargs for sklearn.neighbors.kneighbors_graph (see docs - [here](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.kneighbors_graph.html)). - :return: scipy sparse matrix. - """ A = kneighbors_graph(X, k, **kwargs).toarray() A = sp.csr_matrix(np.maximum(A, A.T)) @@ -77,11 +73,6 @@ def _get_adj_from_data(X, k, **kwargs): def _mnist_grid_graph(k): - """ - Get the adjacency matrix for the KNN graph. - :param k: int, number of neighbours for each node; - :return: - """ X = _grid_coordinates(MNIST_SIZE) A = _get_adj_from_data( X, k, mode='connectivity', metric='euclidean', include_self=False @@ -90,19 +81,13 @@ def _mnist_grid_graph(k): return A -def _flip_random_edges(A, percent): - """ - Flips values of A randomly. - :param A: binary scipy sparse matrix. - :param percent: percent of the edges to flip. - :return: binary scipy sparse matrix. - """ +def _flip_random_edges(A, p_swap): if not A.shape[0] == A.shape[1]: raise ValueError('A must be a square matrix.') dtype = A.dtype A = sp.lil_matrix(A).astype(np.bool) n_elem = A.shape[0] ** 2 - n_elem_to_flip = round(percent * n_elem) + n_elem_to_flip = round(p_swap * n_elem) unique_idx = np.random.choice(n_elem, replace=False, size=n_elem_to_flip) row_idx = unique_idx // A.shape[0] col_idx = unique_idx % A.shape[0] diff --git a/tests/test_data/test_dataset.py b/tests/test_data/test_dataset.py index 75c9b147..40153211 100644 --- a/tests/test_data/test_dataset.py +++ b/tests/test_data/test_dataset.py @@ -11,11 +11,11 @@ class TestDataset(Dataset): def read(self): - return [ - Graph(x=np.random.rand(n, f), a=np.random.randint(0, 2, (n, n)), e=np.random.rand(n, n, s), - y=np.array([0., 1.])) - for n in Ns - ] + return [Graph(x=np.random.rand(n, f), + a=np.random.randint(0, 2, (n, n)), + e=np.random.rand(n, n, s), + y=np.array([0., 1.])) + for n in Ns] def test_dataset(): @@ -36,7 +36,9 @@ def test_dataset(): # __setitem__ n = 100 - g = Graph(x=np.random.rand(n, f), a=np.random.randint(0, 2, (n, n)), e=np.random.rand(n, n, s), + g = Graph(x=np.random.rand(n, f), + a=np.random.randint(0, 2, (n, n)), + e=np.random.rand(n, n, s), y=np.array([0., 1.])) # single assignment diff --git a/tests/test_data/test_utils.py b/tests/test_data/test_utils.py index abbe4bb4..1c1b7427 100644 --- a/tests/test_data/test_utils.py +++ b/tests/test_data/test_utils.py @@ -1,9 +1,8 @@ import numpy as np +import scipy.sparse as sp from spektral.data import Dataset, Graph from spektral.data.utils import to_disjoint, to_batch, batch_generator -import scipy.sparse as sp -import numpy as np ns = np.random.randint(3, 10, 10) f = 3 @@ -14,6 +13,8 @@ def test_to_batch(): # TODO test e_list + x = to_batch(x_list=x_list) + a = to_batch(a_list=a_list) x, a = to_batch(x_list, a_list) assert x.ndim == 3 
assert a.ndim == 3 @@ -23,6 +24,8 @@ def test_to_batch(): def test_to_disjoint(): # TODO test e_list + x, i = to_disjoint(x_list, None) + a, i = to_disjoint(None, a_list) x, a, i = to_disjoint(x_list, a_list) assert x.ndim == 2 assert a.ndim == 2 diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 69a6b855..7e49bb46 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,4 +1,4 @@ -from spektral.data import DisjointLoader, BatchLoader +from spektral.data import DisjointLoader, BatchLoader, SingleLoader from spektral.datasets import qm9, citation, graphsage, mnist, tudataset batch_size = 3 @@ -8,15 +8,17 @@ def test_citation(): dataset = citation.Cora() dataset = citation.Citeseer(random_split=True) dataset = citation.Pubmed(normalize_x=True) + sl = SingleLoader(dataset) def test_graphsage(): # Test only PPI because Travis otherwise runs into memory errors dataset = graphsage.PPI() + sl = SingleLoader(dataset) def test_mnist(): - mnist.load_data(k=8, noise_level=0.1) + dataset = mnist.MNIST(k=8, noise_level=0.1) def test_qm9(): From 6afb6de2875cc511ecc712892ebe44f3e1f42e7e Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Tue, 24 Nov 2020 18:29:40 +0100 Subject: [PATCH 39/57] Docs --- spektral/data/loaders.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index ca587fa4..65fce8bd 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -359,6 +359,9 @@ class PackedBatchLoader(BatchLoader): - the graphs in the dataset have similar sizes and there are no outliers in the dataset (i.e., anomalous graphs with many more nodes than the dataset average). + + This loader is also useful for loading mixed-mode datsets, because it + allows to create "standard" batches of node features with almost no overhead. """ def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): super().__init__(dataset, batch_size=batch_size, epochs=epochs, shuffle=shuffle) From 6759ceacff674374147c5687e3e137f3d892017f Mon Sep 17 00:00:00 2001 From: Alessio Zanga Date: Tue, 24 Nov 2020 17:41:21 +0100 Subject: [PATCH 40/57] Fix missing return_attn_coef in get_config Missing return_attn_coef in get_config create a mismatch during save & load of models. 
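For context, an editor's sketch of why the missing key matters (the layer is still called `GraphAttention` at this point in the series; it is renamed to `GATConv` in a later commit). Keras rebuilds layers from their `get_config()` output when a saved model is loaded, so any constructor argument left out of the config silently falls back to its default in the restored layer:

```python
from spektral.layers import GraphAttention

layer = GraphAttention(8, return_attn_coef=True)
config = layer.get_config()

# Before this fix, 'return_attn_coef' was missing from `config`, so a layer
# rebuilt from it fell back to the default value and stopped returning the
# attention coefficients after a save/load round trip.
rebuilt = GraphAttention.from_config(config)
assert rebuilt.return_attn_coef == layer.return_attn_coef
```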
(cherry picked from commit 6a7c222349d3bb712c74d70738d848f0f67069ac) --- spektral/layers/convolutional/graph_attention.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spektral/layers/convolutional/graph_attention.py b/spektral/layers/convolutional/graph_attention.py index eacaa904..748b595e 100644 --- a/spektral/layers/convolutional/graph_attention.py +++ b/spektral/layers/convolutional/graph_attention.py @@ -256,6 +256,7 @@ def get_config(self): 'attn_heads': self.attn_heads, 'concat_heads': self.concat_heads, 'dropout_rate': self.dropout_rate, + 'return_attn_coef': self.return_attn_coef, 'attn_kernel_initializer': initializers.serialize(self.attn_kernel_initializer), 'attn_kernel_regularizer': regularizers.serialize(self.attn_kernel_regularizer), 'attn_kernel_constraint': constraints.serialize(self.attn_kernel_constraint), From 303d37ad64570c0a47765e39131cc083ea09cd88 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 25 Nov 2020 11:15:42 +0100 Subject: [PATCH 41/57] Standardize naming for layers to *Conv and *Pool --- docs/autogen.py | 10 +++--- docs/templates/getting-started.md | 2 +- examples/graph_prediction/custom_dataset.py | 8 ++--- .../graph_prediction/ogbg-mol-esol_batch.py | 6 ++-- .../graph_prediction/ogbg-mol-hiv_disjoint.py | 6 ++-- examples/graph_prediction/qm9_batch.py | 6 ++-- examples/graph_prediction/qm9_disjoint.py | 6 ++-- examples/node_prediction/citation_gat.py | 36 +++++++++---------- examples/node_prediction/citation_gat_fast.py | 36 +++++++++---------- examples/node_prediction/citation_gcn.py | 18 +++++----- examples/node_prediction/citation_gcn_fast.py | 8 ++--- .../node_prediction/citation_simple_gc.py | 12 +++---- examples/node_prediction/ogbn-arxiv_gcn.py | 8 ++--- .../graph_signal_classification_mnist.py | 8 ++--- examples/other/node_clustering_mincut.py | 4 +-- spektral/layers/convolutional/__init__.py | 10 +++--- .../convolutional/{appnp.py => appnp_conv.py} | 4 +-- spektral/layers/convolutional/arma_conv.py | 4 +-- spektral/layers/convolutional/cheb_conv.py | 4 +-- .../layers/convolutional/diffusion_conv.py | 4 +-- spektral/layers/convolutional/ecc_conv.py | 4 +-- .../{graph_attention.py => gat_conv.py} | 4 +-- .../{graph_conv.py => gcn_conv.py} | 2 +- .../{graph_conv_skip.py => gcs_conv.py} | 4 +-- .../layers/convolutional/graphsage_conv.py | 4 +-- spektral/layers/pooling/global_pool.py | 12 +++---- tests/test_layers/test_convolutional.py | 14 ++++---- tests/test_transforms/test_transforms.py | 4 +-- 28 files changed, 124 insertions(+), 124 deletions(-) rename spektral/layers/convolutional/{appnp.py => appnp_conv.py} (98%) rename spektral/layers/convolutional/{graph_attention.py => gat_conv.py} (99%) rename spektral/layers/convolutional/{graph_conv.py => gcn_conv.py} (99%) rename spektral/layers/convolutional/{graph_conv_skip.py => gcs_conv.py} (97%) diff --git a/docs/autogen.py b/docs/autogen.py index 95163beb..390bc9ad 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -36,14 +36,14 @@ { 'page': 'layers/convolution.md', 'classes': [ - layers.GraphConv, + layers.GCNConv, layers.ChebConv, layers.GraphSageConv, layers.ARMAConv, - layers.EdgeConditionedConv, - layers.GraphAttention, - layers.GraphConvSkip, - layers.APPNP, + layers.ECCConv, + layers.GATConv, + layers.GCSConv, + layers.APPNPConv, layers.GINConv, layers.DiffusionConv, layers.GatedGraphConv, diff --git a/docs/templates/getting-started.md b/docs/templates/getting-started.md index d328beed..75c8a693 100644 --- a/docs/templates/getting-started.md +++ 
b/docs/templates/getting-started.md @@ -123,7 +123,7 @@ Let's start by importing the necessary layers: ```python from tensorflow.keras.models import Model from tensorflow.keras.layers import Dense, Dropout -from spektral.layers import GraphConv, GlobalSumPool +from spektral.layers import GCNConv, GlobalSumPool ``` Now we can use model subclassing to define our model: diff --git a/examples/graph_prediction/custom_dataset.py b/examples/graph_prediction/custom_dataset.py index b6e7e9ca..dae18a7c 100644 --- a/examples/graph_prediction/custom_dataset.py +++ b/examples/graph_prediction/custom_dataset.py @@ -26,7 +26,7 @@ from tensorflow.keras.optimizers import Adam from spektral.data import Dataset, Graph, DisjointLoader -from spektral.layers import GraphConvSkip, GlobalAvgPool +from spektral.layers import GCSConv, GlobalAvgPool from spektral.layers.pooling import TopKPool from spektral.transforms.normalize_adj import NormalizeAdj @@ -106,11 +106,11 @@ def make_graph(): A_in = Input(shape=(None,), sparse=True) I_in = Input(shape=(), name='segment_ids_in', dtype=tf.int32) -X_1 = GraphConvSkip(32, activation='relu')([X_in, A_in]) +X_1 = GCSConv(32, activation='relu')([X_in, A_in]) X_1, A_1, I_1 = TopKPool(ratio=0.5)([X_1, A_in, I_in]) -X_2 = GraphConvSkip(32, activation='relu')([X_1, A_1]) +X_2 = GCSConv(32, activation='relu')([X_1, A_1]) X_2, A_2, I_2 = TopKPool(ratio=0.5)([X_2, A_1, I_1]) -X_3 = GraphConvSkip(32, activation='relu')([X_2, A_2]) +X_3 = GCSConv(32, activation='relu')([X_2, A_2]) X_3 = GlobalAvgPool()([X_3, I_2]) output = Dense(n_out, activation='softmax')(X_3) diff --git a/examples/graph_prediction/ogbg-mol-esol_batch.py b/examples/graph_prediction/ogbg-mol-esol_batch.py index 85a9e4e5..ea4a55e9 100644 --- a/examples/graph_prediction/ogbg-mol-esol_batch.py +++ b/examples/graph_prediction/ogbg-mol-esol_batch.py @@ -14,7 +14,7 @@ from spektral.data import BatchLoader from spektral.datasets import OGB -from spektral.layers import GraphConv, MinCutPool, GlobalSumPool +from spektral.layers import GCNConv, MinCutPool, GlobalSumPool ################################################################################ # PARAMETERS @@ -49,9 +49,9 @@ X_in = Input(shape=(None, F)) A_in = Input(shape=(None, None)) -X_1 = GraphConv(32, activation='relu')([X_in, A_in]) +X_1 = GCNConv(32, activation='relu')([X_in, A_in]) X_1, A_1 = MinCutPool(N // 2)([X_1, A_in]) -X_2 = GraphConv(32, activation='relu')([X_1, A_1]) +X_2 = GCNConv(32, activation='relu')([X_1, A_1]) X_3 = GlobalSumPool()(X_2) output = Dense(n_out)(X_3) diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py index c5ce5f9f..7ce87147 100644 --- a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py +++ b/examples/graph_prediction/ogbg-mol-hiv_disjoint.py @@ -16,7 +16,7 @@ from spektral.data import DisjointLoader from spektral.datasets import OGB -from spektral.layers import EdgeConditionedConv, GlobalSumPool +from spektral.layers import ECCConv, GlobalSumPool ################################################################################ # PARAMETERS @@ -55,8 +55,8 @@ E_in = Input(shape=(S,)) I_in = Input(shape=(), dtype=tf.int64) -X_1 = EdgeConditionedConv(32, activation='relu')([X_in, A_in, E_in]) -X_2 = EdgeConditionedConv(32, activation='relu')([X_1, A_in, E_in]) +X_1 = ECCConv(32, activation='relu')([X_in, A_in, E_in]) +X_2 = ECCConv(32, activation='relu')([X_1, A_in, E_in]) X_3 = GlobalSumPool()([X_2, I_in]) output = Dense(n_out, activation='sigmoid')(X_3) diff --git 
a/examples/graph_prediction/qm9_batch.py b/examples/graph_prediction/qm9_batch.py index 1618baac..d601973c 100644 --- a/examples/graph_prediction/qm9_batch.py +++ b/examples/graph_prediction/qm9_batch.py @@ -10,7 +10,7 @@ from spektral.data import BatchLoader from spektral.datasets import QM9 -from spektral.layers import EdgeConditionedConv, GlobalSumPool +from spektral.layers import ECCConv, GlobalSumPool ################################################################################ # PARAMETERS @@ -42,8 +42,8 @@ A_in = Input(shape=(None, None)) E_in = Input(shape=(None, None, S)) -X_1 = EdgeConditionedConv(32, activation='relu')([X_in, A_in, E_in]) -X_2 = EdgeConditionedConv(32, activation='relu')([X_1, A_in, E_in]) +X_1 = ECCConv(32, activation='relu')([X_in, A_in, E_in]) +X_2 = ECCConv(32, activation='relu')([X_1, A_in, E_in]) X_3 = GlobalSumPool()(X_2) output = Dense(n_out)(X_3) diff --git a/examples/graph_prediction/qm9_disjoint.py b/examples/graph_prediction/qm9_disjoint.py index c7e64e24..e8a0523f 100644 --- a/examples/graph_prediction/qm9_disjoint.py +++ b/examples/graph_prediction/qm9_disjoint.py @@ -12,7 +12,7 @@ from spektral.data import DisjointLoader from spektral.datasets import QM9 -from spektral.layers import EdgeConditionedConv, GlobalSumPool +from spektral.layers import ECCConv, GlobalSumPool ################################################################################ # PARAMETERS @@ -48,8 +48,8 @@ E_in = Input(shape=(S,), name='E_in') I_in = Input(shape=(), name='segment_ids_in', dtype=tf.int32) -X_1 = EdgeConditionedConv(32, activation='relu')([X_in, A_in, E_in]) -X_2 = EdgeConditionedConv(32, activation='relu')([X_1, A_in, E_in]) +X_1 = ECCConv(32, activation='relu')([X_in, A_in, E_in]) +X_2 = ECCConv(32, activation='relu')([X_1, A_in, E_in]) X_3 = GlobalSumPool()([X_2, I_in]) output = Dense(n_out)(X_3) diff --git a/examples/node_prediction/citation_gat.py b/examples/node_prediction/citation_gat.py index 8ff32203..a8a0d6c9 100644 --- a/examples/node_prediction/citation_gat.py +++ b/examples/node_prediction/citation_gat.py @@ -13,12 +13,12 @@ from spektral.data.loaders import SingleLoader from spektral.datasets.citation import Citation -from spektral.layers import GraphAttention +from spektral.layers import GATConv from spektral.transforms import LayerPreprocess, AdjToSpTensor # Load data dataset = Citation('cora', - transforms=[LayerPreprocess(GraphAttention), AdjToSpTensor()]) + transforms=[LayerPreprocess(GATConv), AdjToSpTensor()]) mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Parameters @@ -40,23 +40,23 @@ a_in = Input((N,), sparse=True, dtype=a_dtype) do_1 = Dropout(dropout)(x_in) -gc_1 = GraphAttention(channels, - attn_heads=n_attn_heads, - concat_heads=True, - dropout_rate=dropout, - activation='elu', - kernel_regularizer=l2(l2_reg), - attn_kernel_regularizer=l2(l2_reg) - )([do_1, a_in]) +gc_1 = GATConv(channels, + attn_heads=n_attn_heads, + concat_heads=True, + dropout_rate=dropout, + activation='elu', + kernel_regularizer=l2(l2_reg), + attn_kernel_regularizer=l2(l2_reg) + )([do_1, a_in]) do_2 = Dropout(dropout)(gc_1) -gc_2 = GraphAttention(n_out, - attn_heads=1, - concat_heads=False, - dropout_rate=dropout, - activation='softmax', - kernel_regularizer=l2(l2_reg), - attn_kernel_regularizer=l2(l2_reg) - )([do_2, a_in]) +gc_2 = GATConv(n_out, + attn_heads=1, + concat_heads=False, + dropout_rate=dropout, + activation='softmax', + kernel_regularizer=l2(l2_reg), + attn_kernel_regularizer=l2(l2_reg) + )([do_2, a_in]) # Build 
model model = Model(inputs=[x_in, a_in], outputs=gc_2) diff --git a/examples/node_prediction/citation_gat_fast.py b/examples/node_prediction/citation_gat_fast.py index 02e7b4d1..c01fed50 100644 --- a/examples/node_prediction/citation_gat_fast.py +++ b/examples/node_prediction/citation_gat_fast.py @@ -13,12 +13,12 @@ from tensorflow.keras.regularizers import l2 from spektral.datasets.citation import Cora -from spektral.layers import GraphAttention +from spektral.layers import GATConv from spektral.transforms import LayerPreprocess, AdjToSpTensor from spektral.utils import tic, toc # Load data -dataset = Cora(transforms=[LayerPreprocess(GraphAttention), AdjToSpTensor()]) +dataset = Cora(transforms=[LayerPreprocess(GATConv), AdjToSpTensor()]) graph = dataset[0] x, a, y = graph.x, graph.a, graph.y mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te @@ -27,23 +27,23 @@ x_in = Input(shape=(dataset.n_node_features,)) a_in = Input(shape=(None,), sparse=True) x_1 = Dropout(0.6)(x_in) -x_1 = GraphAttention(8, - attn_heads=8, - concat_heads=True, - dropout_rate=0.6, - activation='elu', - kernel_regularizer=l2(5e-4), - attn_kernel_regularizer=l2(5e-4), - bias_regularizer=l2(5e-4))([x_1, a_in]) +x_1 = GATConv(8, + attn_heads=8, + concat_heads=True, + dropout_rate=0.6, + activation='elu', + kernel_regularizer=l2(5e-4), + attn_kernel_regularizer=l2(5e-4), + bias_regularizer=l2(5e-4))([x_1, a_in]) x_2 = Dropout(0.6)(x_1) -x_2 = GraphAttention(dataset.n_labels, - attn_heads=1, - concat_heads=True, - dropout_rate=0.6, - activation='softmax', - kernel_regularizer=l2(5e-4), - attn_kernel_regularizer=l2(5e-4), - bias_regularizer=l2(5e-4))([x_2, a_in]) +x_2 = GATConv(dataset.n_labels, + attn_heads=1, + concat_heads=True, + dropout_rate=0.6, + activation='softmax', + kernel_regularizer=l2(5e-4), + attn_kernel_regularizer=l2(5e-4), + bias_regularizer=l2(5e-4))([x_2, a_in]) # Build model model = Model(inputs=[x_in, a_in], outputs=x_2) diff --git a/examples/node_prediction/citation_gcn.py b/examples/node_prediction/citation_gcn.py index 05dc06b9..9813d48e 100644 --- a/examples/node_prediction/citation_gcn.py +++ b/examples/node_prediction/citation_gcn.py @@ -13,12 +13,12 @@ from spektral.data.loaders import SingleLoader from spektral.datasets.citation import Citation -from spektral.layers import GraphConv +from spektral.layers import GCNConv from spektral.transforms import LayerPreprocess, AdjToSpTensor # Load data dataset = Citation('cora', - transforms=[LayerPreprocess(GraphConv), AdjToSpTensor()]) + transforms=[LayerPreprocess(GCNConv), AdjToSpTensor()]) mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Parameters @@ -39,14 +39,14 @@ a_in = Input((N,), sparse=True, dtype=a_dtype) do_1 = Dropout(dropout)(x_in) -gc_1 = GraphConv(channels, - activation='relu', - kernel_regularizer=l2(l2_reg), - use_bias=False)([do_1, a_in]) +gc_1 = GCNConv(channels, + activation='relu', + kernel_regularizer=l2(l2_reg), + use_bias=False)([do_1, a_in]) do_2 = Dropout(dropout)(gc_1) -gc_2 = GraphConv(n_out, - activation='softmax', - use_bias=False)([do_2, a_in]) +gc_2 = GCNConv(n_out, + activation='softmax', + use_bias=False)([do_2, a_in]) # Build model model = Model(inputs=[x_in, a_in], outputs=gc_2) diff --git a/examples/node_prediction/citation_gcn_fast.py b/examples/node_prediction/citation_gcn_fast.py index ee6f8dfa..f33fb932 100644 --- a/examples/node_prediction/citation_gcn_fast.py +++ b/examples/node_prediction/citation_gcn_fast.py @@ -12,12 +12,12 @@ from 
tensorflow.keras.regularizers import l2 from spektral.datasets.citation import Cora -from spektral.layers import GraphConv +from spektral.layers import GCNConv from spektral.transforms import LayerPreprocess, AdjToSpTensor from spektral.utils import tic, toc # Load data -dataset = Cora(transforms=[LayerPreprocess(GraphConv), AdjToSpTensor()]) +dataset = Cora(transforms=[LayerPreprocess(GCNConv), AdjToSpTensor()]) graph = dataset[0] x, a, y = graph.x, graph.a, graph.y mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te @@ -25,9 +25,9 @@ # Define model x_in = Input(shape=(dataset.n_node_features,)) a_in = Input((dataset.n_node_features,), sparse=True) -x_1 = GraphConv(16, 'relu', True, kernel_regularizer=l2(5e-4))([x_in, a_in]) +x_1 = GCNConv(16, 'relu', True, kernel_regularizer=l2(5e-4))([x_in, a_in]) x_1 = Dropout(0.5)(x_1) -x_2 = GraphConv(y.shape[1], 'softmax', True)([x_1, a_in]) +x_2 = GCNConv(y.shape[1], 'softmax', True)([x_1, a_in]) # Build model model = Model(inputs=[x_in, a_in], outputs=x_2) diff --git a/examples/node_prediction/citation_simple_gc.py b/examples/node_prediction/citation_simple_gc.py index c815bee2..843a1f90 100644 --- a/examples/node_prediction/citation_simple_gc.py +++ b/examples/node_prediction/citation_simple_gc.py @@ -17,7 +17,7 @@ from spektral.data.loaders import SingleLoader from spektral.datasets.citation import Citation -from spektral.layers import GraphConv +from spektral.layers import GCNConv from spektral.transforms import LayerPreprocess, AdjToSpTensor @@ -37,7 +37,7 @@ def __call__(self, graph): # Load data K = 2 # Propagation steps for SGCN dataset = Citation('cora', - transforms=[LayerPreprocess(GraphConv), SGCN(K), AdjToSpTensor()]) + transforms=[LayerPreprocess(GCNConv), SGCN(K), AdjToSpTensor()]) mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te # Parameters @@ -55,10 +55,10 @@ def __call__(self, graph): x_in = Input(shape=(F,)) a_in = Input((N,), sparse=True, dtype=a_dtype) -output = GraphConv(n_out, - activation='softmax', - kernel_regularizer=l2(l2_reg), - use_bias=False)([x_in, a_in]) +output = GCNConv(n_out, + activation='softmax', + kernel_regularizer=l2(l2_reg), + use_bias=False)([x_in, a_in]) # Build model model = Model(inputs=[x_in, a_in], outputs=output) diff --git a/examples/node_prediction/ogbn-arxiv_gcn.py b/examples/node_prediction/ogbn-arxiv_gcn.py index 1d8d3af5..9864b71c 100644 --- a/examples/node_prediction/ogbn-arxiv_gcn.py +++ b/examples/node_prediction/ogbn-arxiv_gcn.py @@ -12,7 +12,7 @@ from tensorflow.keras.optimizers import Adam from spektral.datasets.ogb import OGB -from spektral.layers import GraphConv +from spektral.layers import GCNConv from spektral.transforms import GCNFilter, AdjToSpTensor # Load data @@ -46,13 +46,13 @@ # Model definition x_in = Input(shape=(F,)) a_in = Input((N,), sparse=True) -x_1 = GraphConv(channels, activation='relu')([x_in, a_in]) +x_1 = GCNConv(channels, activation='relu')([x_in, a_in]) x_1 = BatchNormalization()(x_1) x_1 = Dropout(dropout)(x_1) -x_2 = GraphConv(channels, activation='relu')([x_1, a_in]) +x_2 = GCNConv(channels, activation='relu')([x_1, a_in]) x_2 = BatchNormalization()(x_2) x_2 = Dropout(dropout)(x_2) -x_3 = GraphConv(n_out, activation='softmax')([x_2, a_in]) +x_3 = GCNConv(n_out, activation='softmax')([x_2, a_in]) # Build model model = Model(inputs=[x_in, a_in], outputs=x_3) diff --git a/examples/other/graph_signal_classification_mnist.py b/examples/other/graph_signal_classification_mnist.py index 470f6661..31b4c098 100644 
--- a/examples/other/graph_signal_classification_mnist.py +++ b/examples/other/graph_signal_classification_mnist.py @@ -5,7 +5,7 @@ from spektral.data import PackedBatchLoader from spektral.datasets.mnist import MNIST -from spektral.layers import GraphConv +from spektral.layers import GCNConv from spektral.layers.ops import sp_matrix_to_sp_tensor # Parameters @@ -20,7 +20,7 @@ # The adjacency matrix is stored as an attribute of the dataset. # Create filter for GCN and convert to sparse tensor. adj = data.a -adj = GraphConv.preprocess(adj) +adj = GCNConv.preprocess(adj) adj = sp_matrix_to_sp_tensor(adj) # Train/valid/test split @@ -33,8 +33,8 @@ class Net(Model): def __init__(self, **kwargs): super().__init__(**kwargs) - self.conv1 = GraphConv(32, activation='elu', kernel_regularizer=l2(l2_reg)) - self.conv2 = GraphConv(32, activation='elu', kernel_regularizer=l2(l2_reg)) + self.conv1 = GCNConv(32, activation='elu', kernel_regularizer=l2(l2_reg)) + self.conv2 = GCNConv(32, activation='elu', kernel_regularizer=l2(l2_reg)) self.flatten = Flatten() self.fc1 = Dense(512, activation='relu') self.fc2 = Dense(10, activation='softmax') # MNIST has 10 classes diff --git a/examples/other/node_clustering_mincut.py b/examples/other/node_clustering_mincut.py index 592b78a0..5adb9391 100644 --- a/examples/other/node_clustering_mincut.py +++ b/examples/other/node_clustering_mincut.py @@ -15,7 +15,7 @@ from tqdm import tqdm from spektral.datasets.citation import Cora -from spektral.layers.convolutional import GraphConvSkip +from spektral.layers.convolutional import GCSConv from spektral.layers.ops import sp_matrix_to_sp_tensor from spektral.layers.pooling import MinCutPool from spektral.utils.convolution import normalized_adjacency @@ -52,7 +52,7 @@ def train_step(inputs): x_in = Input(shape=(F,), name='X_in') a_in = Input(shape=(None,), name='A_in', sparse=True) -x_1 = GraphConvSkip(16, activation='elu')([x_in, a_in]) +x_1 = GCSConv(16, activation='elu')([x_in, a_in]) x_1, a_1, s_1 = MinCutPool(n_clusters, return_mask=True)([x_1, a_in]) model = Model([x_in, a_in], [x_1, s_1]) diff --git a/spektral/layers/convolutional/__init__.py b/spektral/layers/convolutional/__init__.py index ee43f846..929bf652 100644 --- a/spektral/layers/convolutional/__init__.py +++ b/spektral/layers/convolutional/__init__.py @@ -1,16 +1,16 @@ from .agnn_conv import AGNNConv -from .appnp import APPNP +from .appnp_conv import APPNPConv from .arma_conv import ARMAConv from .cheb_conv import ChebConv from .crystal_conv import CrystalConv from .diffusion_conv import DiffusionConv -from .ecc_conv import EdgeConditionedConv +from .ecc_conv import ECCConv from .edge_conv import EdgeConv from .gated_graph_conv import GatedGraphConv from .gin_conv import GINConv -from .graph_attention import GraphAttention -from .graph_conv import GraphConv -from .graph_conv_skip import GraphConvSkip +from .gat_conv import GATConv +from .gcn_conv import GCNConv +from .gcs_conv import GCSConv from .graphsage_conv import GraphSageConv from .message_passing import MessagePassing from .tag_conv import TAGConv diff --git a/spektral/layers/convolutional/appnp.py b/spektral/layers/convolutional/appnp_conv.py similarity index 98% rename from spektral/layers/convolutional/appnp.py rename to spektral/layers/convolutional/appnp_conv.py index b5cb4a38..1c104852 100644 --- a/spektral/layers/convolutional/appnp.py +++ b/spektral/layers/convolutional/appnp_conv.py @@ -3,10 +3,10 @@ from tensorflow.keras.models import Sequential from spektral.layers import ops -from 
spektral.layers.convolutional.graph_conv import GraphConv +from spektral.layers.convolutional.gcn_conv import GCNConv -class APPNP(GraphConv): +class APPNPConv(GCNConv): r""" A graph convolutional layer implementing the APPNP operator, as presented by [Klicpera et al. (2019)](https://arxiv.org/abs/1810.05997). diff --git a/spektral/layers/convolutional/arma_conv.py b/spektral/layers/convolutional/arma_conv.py index 73c56a40..c9d1e0c5 100644 --- a/spektral/layers/convolutional/arma_conv.py +++ b/spektral/layers/convolutional/arma_conv.py @@ -2,11 +2,11 @@ from tensorflow.keras.layers import Dropout from spektral.layers import ops -from spektral.layers.convolutional.graph_conv import GraphConv +from spektral.layers.convolutional.gcn_conv import GCNConv from spektral.utils import normalized_laplacian, rescale_laplacian -class ARMAConv(GraphConv): +class ARMAConv(GCNConv): r""" A graph convolutional layer with ARMA\(_K\) filters, as presented by [Bianchi et al. (2019)](https://arxiv.org/abs/1901.01343). diff --git a/spektral/layers/convolutional/cheb_conv.py b/spektral/layers/convolutional/cheb_conv.py index d4739f72..aa42f898 100644 --- a/spektral/layers/convolutional/cheb_conv.py +++ b/spektral/layers/convolutional/cheb_conv.py @@ -1,11 +1,11 @@ from tensorflow.keras import backend as K from spektral.layers import ops -from spektral.layers.convolutional.graph_conv import GraphConv +from spektral.layers.convolutional.gcn_conv import GCNConv from spektral.utils import normalized_laplacian, rescale_laplacian -class ChebConv(GraphConv): +class ChebConv(GCNConv): r""" A Chebyshev convolutional layer as presented by [Defferrard et al. (2016)](https://arxiv.org/abs/1606.09375). diff --git a/spektral/layers/convolutional/diffusion_conv.py b/spektral/layers/convolutional/diffusion_conv.py index 477247fb..da8ae8a3 100644 --- a/spektral/layers/convolutional/diffusion_conv.py +++ b/spektral/layers/convolutional/diffusion_conv.py @@ -1,6 +1,6 @@ import tensorflow as tf import tensorflow.keras.layers as layers -from spektral.layers.convolutional.graph_conv import GraphConv +from spektral.layers.convolutional.gcn_conv import GCNConv class DiffuseFeatures(layers.Layer): @@ -81,7 +81,7 @@ def call(self, inputs): return tf.expand_dims(H, -1) -class DiffusionConv(GraphConv): +class DiffusionConv(GCNConv): r"""Applies Graph Diffusion Convolution as descibed by [Li et al. (2016)](https://arxiv.org/pdf/1707.01926.pdf) diff --git a/spektral/layers/convolutional/ecc_conv.py b/spektral/layers/convolutional/ecc_conv.py index 70ef70c3..40400394 100644 --- a/spektral/layers/convolutional/ecc_conv.py +++ b/spektral/layers/convolutional/ecc_conv.py @@ -4,10 +4,10 @@ from spektral.layers import ops from spektral.layers.ops import modes -from spektral.layers.convolutional.graph_conv import GraphConv +from spektral.layers.convolutional.gcn_conv import GCNConv -class EdgeConditionedConv(GraphConv): +class ECCConv(GCNConv): r""" An edge-conditioned convolutional layer (ECC) as presented by [Simonovsky & Komodakis (2017)](https://arxiv.org/abs/1704.02901). 
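Note: for downstream code, the renames in this patch (GraphConv → GCNConv, GraphAttention → GATConv, EdgeConditionedConv → ECCConv, GraphConvSkip → GCSConv, APPNP → APPNPConv) are import-level changes only; the constructor arguments stay the same, as the updated example scripts above show. A minimal migration sketch (hypothetical user code, not part of the diff):

```python
# Old imports (pre-rename):
# from spektral.layers import GraphConv, GraphAttention, EdgeConditionedConv, GraphConvSkip, APPNP

# New imports after this patch:
from spektral.layers import GCNConv, GATConv, ECCConv, GCSConv, APPNPConv

# Constructor arguments are unchanged, so only the class names need updating:
gcn = GCNConv(32, activation='relu')               # was GraphConv(32, activation='relu')
gat = GATConv(8, attn_heads=8, dropout_rate=0.6)   # was GraphAttention(...)
ecc = ECCConv(32, kernel_network=[8])              # was EdgeConditionedConv(...)
```
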
diff --git a/spektral/layers/convolutional/graph_attention.py b/spektral/layers/convolutional/gat_conv.py similarity index 99% rename from spektral/layers/convolutional/graph_attention.py rename to spektral/layers/convolutional/gat_conv.py index 748b595e..645affeb 100644 --- a/spektral/layers/convolutional/graph_attention.py +++ b/spektral/layers/convolutional/gat_conv.py @@ -4,11 +4,11 @@ from tensorflow.keras.layers import Dropout from spektral.layers import ops -from spektral.layers.convolutional.graph_conv import GraphConv +from spektral.layers.convolutional.gcn_conv import GCNConv from spektral.layers.ops import modes -class GraphAttention(GraphConv): +class GATConv(GCNConv): r""" A graph attention layer (GAT) as presented by [Velickovic et al. (2017)](https://arxiv.org/abs/1710.10903). diff --git a/spektral/layers/convolutional/graph_conv.py b/spektral/layers/convolutional/gcn_conv.py similarity index 99% rename from spektral/layers/convolutional/graph_conv.py rename to spektral/layers/convolutional/gcn_conv.py index 255284e0..23cc4dfa 100644 --- a/spektral/layers/convolutional/graph_conv.py +++ b/spektral/layers/convolutional/gcn_conv.py @@ -6,7 +6,7 @@ from spektral.utils import gcn_filter -class GraphConv(Layer): +class GCNConv(Layer): r""" A graph convolutional layer (GCN) as presented by [Kipf & Welling (2016)](https://arxiv.org/abs/1609.02907). diff --git a/spektral/layers/convolutional/graph_conv_skip.py b/spektral/layers/convolutional/gcs_conv.py similarity index 97% rename from spektral/layers/convolutional/graph_conv_skip.py rename to spektral/layers/convolutional/gcs_conv.py index fa40495b..1a2a1326 100644 --- a/spektral/layers/convolutional/graph_conv_skip.py +++ b/spektral/layers/convolutional/gcs_conv.py @@ -1,11 +1,11 @@ from tensorflow.keras import backend as K from spektral.layers import ops -from spektral.layers.convolutional.graph_conv import GraphConv +from spektral.layers.convolutional.gcn_conv import GCNConv from spektral.utils import normalized_adjacency -class GraphConvSkip(GraphConv): +class GCSConv(GCNConv): r""" A simple convolutional layer with a skip connection. diff --git a/spektral/layers/convolutional/graphsage_conv.py b/spektral/layers/convolutional/graphsage_conv.py index 00700791..01837315 100644 --- a/spektral/layers/convolutional/graphsage_conv.py +++ b/spektral/layers/convolutional/graphsage_conv.py @@ -2,10 +2,10 @@ from tensorflow.keras import backend as K from spektral.layers import ops -from spektral.layers.convolutional.graph_conv import GraphConv +from spektral.layers.convolutional.gcn_conv import GCNConv -class GraphSageConv(GraphConv): +class GraphSageConv(GCNConv): r""" A GraphSAGE layer as presented by [Hamilton et al. (2017)](https://arxiv.org/abs/1706.02216). diff --git a/spektral/layers/pooling/global_pool.py b/spektral/layers/pooling/global_pool.py index 7050eae5..a972323c 100644 --- a/spektral/layers/pooling/global_pool.py +++ b/spektral/layers/pooling/global_pool.py @@ -5,7 +5,7 @@ from spektral.layers import ops -class GlobalPooling(Layer): +class GlobalPool(Layer): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -50,7 +50,7 @@ def get_config(self): return super().get_config() -class GlobalSumPool(GlobalPooling): +class GlobalSumPool(GlobalPool): """ A global sum pooling layer. Pools a graph by computing the sum of its node features. @@ -79,7 +79,7 @@ def __init__(self, **kwargs): self.batch_pooling_op = tf.reduce_sum -class GlobalAvgPool(GlobalPooling): +class GlobalAvgPool(GlobalPool): """ An average pooling layer. 
Pools a graph by computing the average of its node features. @@ -108,7 +108,7 @@ def __init__(self, **kwargs): self.batch_pooling_op = tf.reduce_mean -class GlobalMaxPool(GlobalPooling): +class GlobalMaxPool(GlobalPool): """ A max pooling layer. Pools a graph by computing the maximum of its node features. @@ -137,7 +137,7 @@ def __init__(self, **kwargs): self.batch_pooling_op = tf.reduce_max -class GlobalAttentionPool(GlobalPooling): +class GlobalAttentionPool(GlobalPool): r""" A gated attention global pooling layer as presented by [Li et al. (2017)](https://arxiv.org/abs/1511.05493). @@ -249,7 +249,7 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) -class GlobalAttnSumPool(GlobalPooling): +class GlobalAttnSumPool(GlobalPool): r""" A node-attention global pooling layer. Pools a graph by learning attention coefficients to sum node features. diff --git a/tests/test_layers/test_convolutional.py b/tests/test_layers/test_convolutional.py index 0bb52c9f..604e501d 100644 --- a/tests/test_layers/test_convolutional.py +++ b/tests/test_layers/test_convolutional.py @@ -2,8 +2,8 @@ import tensorflow as tf from tensorflow.keras import Model, Input -from spektral.layers import GraphConv, ChebConv, EdgeConditionedConv, GraphAttention, \ - GraphConvSkip, ARMAConv, APPNP, GraphSageConv, GINConv, DiffusionConv, \ +from spektral.layers import GCNConv, ChebConv, ECCConv, GATConv, \ + GCSConv, ARMAConv, APPNPConv, GraphSageConv, GINConv, DiffusionConv, \ GatedGraphConv, AGNNConv, TAGConv, CrystalConv, MessagePassing, EdgeConv from spektral.layers.ops import sp_matrix_to_sp_tensor @@ -55,7 +55,7 @@ TESTS = [ { - LAYER_K_: GraphConv, + LAYER_K_: GCNConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'sparse': [False, True]}, }, @@ -70,19 +70,19 @@ KWARGS_K_: {'channels': 8, 'activation': 'relu', 'sparse': [False, True]} }, { - LAYER_K_: EdgeConditionedConv, + LAYER_K_: ECCConv, MODES_K_: [SINGLE, BATCH], KWARGS_K_: {'kernel_network': [8], 'channels': 8, 'activation': 'relu', 'edges': True, 'sparse': [False, True]} }, { - LAYER_K_: GraphAttention, + LAYER_K_: GATConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'attn_heads': 2, 'concat_heads': False, 'activation': 'relu', 'sparse': [False, True]} }, { - LAYER_K_: GraphConvSkip, + LAYER_K_: GCSConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'sparse': [False, True]} }, @@ -93,7 +93,7 @@ 'share_weights': True, 'sparse': [False, True]} }, { - LAYER_K_: APPNP, + LAYER_K_: APPNPConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'mlp_hidden': [16], 'sparse': [False, True]} diff --git a/tests/test_transforms/test_transforms.py b/tests/test_transforms/test_transforms.py index 73c7b4bd..de2b519b 100644 --- a/tests/test_transforms/test_transforms.py +++ b/tests/test_transforms/test_transforms.py @@ -58,8 +58,8 @@ def test_gcn_filter(): def test_layer_preprocess(): - from spektral.layers import GraphConv - t = LayerPreprocess(GraphConv) + from spektral.layers import GCNConv + t = LayerPreprocess(GCNConv) assert callable(t) g = Graph(x=x, a=a, e=e, y=y_nl) t(g) From 694c40e00eec40f3c5b71e7bdcae1a2cfc947301 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 25 Nov 2020 14:35:55 +0100 Subject: [PATCH 42/57] Move GraphSageConv to MessagePassing interface Update naming conventions for variables to match PEP8 Add add_self_loops method for SparseTensors --- spektral/layers/convolutional/__init__.py | 2 
+- spektral/layers/convolutional/agnn_conv.py | 16 +-- spektral/layers/convolutional/appnp_conv.py | 8 +- spektral/layers/convolutional/arma_conv.py | 16 +-- spektral/layers/convolutional/cheb_conv.py | 18 +-- spektral/layers/convolutional/crystal_conv.py | 14 +- .../layers/convolutional/diffusion_conv.py | 125 ++++++------------ spektral/layers/convolutional/ecc_conv.py | 56 ++++---- spektral/layers/convolutional/edge_conv.py | 8 +- spektral/layers/convolutional/gat_conv.py | 51 ++++--- spektral/layers/convolutional/gcn_conv.py | 13 +- spektral/layers/convolutional/gcs_conv.py | 16 +-- spektral/layers/convolutional/general_conv.py | 63 +++++++++ spektral/layers/convolutional/gin_conv.py | 2 +- .../layers/convolutional/graphsage_conv.py | 52 ++------ .../layers/convolutional/message_passing.py | 60 ++++----- spektral/layers/convolutional/tag_conv.py | 10 +- spektral/layers/ops/sparse.py | 38 +++++- tests/test_layers/test_convolutional.py | 2 +- 19 files changed, 298 insertions(+), 272 deletions(-) create mode 100644 spektral/layers/convolutional/general_conv.py diff --git a/spektral/layers/convolutional/__init__.py b/spektral/layers/convolutional/__init__.py index 929bf652..861d978a 100644 --- a/spektral/layers/convolutional/__init__.py +++ b/spektral/layers/convolutional/__init__.py @@ -12,5 +12,5 @@ from .gcn_conv import GCNConv from .gcs_conv import GCSConv from .graphsage_conv import GraphSageConv -from .message_passing import MessagePassing from .tag_conv import TAGConv +from .message_passing import MessagePassing \ No newline at end of file diff --git a/spektral/layers/convolutional/agnn_conv.py b/spektral/layers/convolutional/agnn_conv.py index 621d6528..70160d75 100644 --- a/spektral/layers/convolutional/agnn_conv.py +++ b/spektral/layers/convolutional/agnn_conv.py @@ -59,21 +59,21 @@ def build(self, input_shape): def call(self, inputs, **kwargs): x, a, _ = self.get_inputs(inputs) - X_norm = K.l2_normalize(x, axis=-1) - output = self.propagate(x, a, X_norm=X_norm) + x_norm = K.l2_normalize(x, axis=-1) + output = self.propagate(x, a, x_norm=x_norm) output = self.activation(output) return output - def message(self, X, X_norm=None): - X_j = self.get_j(X) - X_norm_i = self.get_i(X_norm) - X_norm_j = self.get_j(X_norm) - alpha = self.beta * tf.reduce_sum(X_norm_i * X_norm_j, axis=-1) + def message(self, x, x_norm=None): + x_j = self.get_j(x) + x_norm_i = self.get_i(x_norm) + x_norm_j = self.get_j(x_norm) + alpha = self.beta * tf.reduce_sum(x_norm_i * x_norm_j, axis=-1) alpha = ops.unsorted_segment_softmax(alpha, self.index_i, self.N) alpha = alpha[:, None] - return alpha * X_j + return alpha * x_j def get_config(self): config = { diff --git a/spektral/layers/convolutional/appnp_conv.py b/spektral/layers/convolutional/appnp_conv.py index 1c104852..674be779 100644 --- a/spektral/layers/convolutional/appnp_conv.py +++ b/spektral/layers/convolutional/appnp_conv.py @@ -117,14 +117,14 @@ def call(self, inputs): mlp_out = self.mlp(features) # Propagation - Z = mlp_out + z = mlp_out for k in range(self.propagations): - Z = (1 - self.alpha) * ops.filter_dot(fltr, Z) + self.alpha * mlp_out + z = (1 - self.alpha) * ops.filter_dot(fltr, z) + self.alpha * mlp_out if self.activation is not None: - output = self.activation(Z) + output = self.activation(z) else: - output = Z + output = z return output def get_config(self): diff --git a/spektral/layers/convolutional/arma_conv.py b/spektral/layers/convolutional/arma_conv.py index c9d1e0c5..c8886c59 100644 --- a/spektral/layers/convolutional/arma_conv.py 
+++ b/spektral/layers/convolutional/arma_conv.py @@ -120,15 +120,15 @@ def build(self, input_shape): self.built = True def call(self, inputs): - features = inputs[0] - fltr = inputs[1] + x = inputs[0] + a = inputs[1] # Convolution output = [] # Stores the parallel filters for k in range(self.order): - output_k = features + output_k = x for i in range(self.iterations): - output_k = self.gcs([output_k, features, fltr], k, i) + output_k = self.gcs([output_k, x, a], k, i) output.append(output_k) # Average stacks @@ -217,7 +217,7 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) @staticmethod - def preprocess(A): - fltr = normalized_laplacian(A, symmetric=True) - fltr = rescale_laplacian(fltr, lmax=2) - return fltr + def preprocess(a): + a = normalized_laplacian(a, symmetric=True) + a = rescale_laplacian(a, lmax=2) + return a diff --git a/spektral/layers/convolutional/cheb_conv.py b/spektral/layers/convolutional/cheb_conv.py index aa42f898..efbe3548 100644 --- a/spektral/layers/convolutional/cheb_conv.py +++ b/spektral/layers/convolutional/cheb_conv.py @@ -102,19 +102,19 @@ def build(self, input_shape): self.built = True def call(self, inputs): - features = inputs[0] - laplacian = inputs[1] + x = inputs[0] + a = inputs[1] # Convolution - T_0 = features + T_0 = x output = ops.dot(T_0, self.kernel[0]) if self.K > 1: - T_1 = ops.filter_dot(laplacian, features) + T_1 = ops.filter_dot(a, x) output += ops.dot(T_1, self.kernel[1]) for k in range(2, self.K): - T_2 = 2 * ops.filter_dot(laplacian, T_1) - T_0 + T_2 = 2 * ops.filter_dot(a, T_1) - T_0 output += ops.dot(T_2, self.kernel[k]) T_0, T_1 = T_1, T_2 @@ -132,7 +132,7 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) @staticmethod - def preprocess(A): - L = normalized_laplacian(A) - L = rescale_laplacian(L) - return L + def preprocess(a): + a = normalized_laplacian(a) + a = rescale_laplacian(a) + return a diff --git a/spektral/layers/convolutional/crystal_conv.py b/spektral/layers/convolutional/crystal_conv.py index 3a1eca47..410874e2 100644 --- a/spektral/layers/convolutional/crystal_conv.py +++ b/spektral/layers/convolutional/crystal_conv.py @@ -90,16 +90,16 @@ def build(self, input_shape): self.built = True - def message(self, X, E=None): - X_i = self.get_i(X) - X_j = self.get_j(X) - Z = K.concatenate((X_i, X_j, E), axis=-1) - output = self.dense_s(Z) * self.dense_f(Z) + def message(self, x, e=None): + x_i = self.get_i(x) + x_j = self.get_j(x) + z = K.concatenate((x_i, x_j, e), axis=-1) + output = self.dense_s(z) * self.dense_f(z) return output - def update(self, embeddings, X=None): - return X + embeddings + def update(self, embeddings, x=None): + return x + embeddings def get_config(self): config = { diff --git a/spektral/layers/convolutional/diffusion_conv.py b/spektral/layers/convolutional/diffusion_conv.py index da8ae8a3..35d17144 100644 --- a/spektral/layers/convolutional/diffusion_conv.py +++ b/spektral/layers/convolutional/diffusion_conv.py @@ -4,10 +4,10 @@ class DiffuseFeatures(layers.Layer): - r"""Utility layer calculating a single channel of the - diffusional convolution. + r""" + Utility layer calculating a single channel of the diffusional convolution. 
- Procedure is based on https://arxiv.org/abs/1707.01926 + The procedure is based on [https://arxiv.org/abs/1707.01926](https://arxiv.org/abs/1707.01926) **Input** @@ -28,49 +28,38 @@ class DiffuseFeatures(layers.Layer): - `kernel_constraint`: constraint applied to the kernel vectors; """ - def __init__( - self, - num_diffusion_steps: int, - kernel_initializer, - kernel_regularizer, - kernel_constraint, - **kwargs - ): - super(DiffuseFeatures, self).__init__() - - # number of diffusino steps (K in paper) - self.K = num_diffusion_steps + def __init__(self, + num_diffusion_steps, + kernel_initializer, + kernel_regularizer, + kernel_constraint, + **kwargs): + super(DiffuseFeatures, self).__init__(**kwargs) - # get regularizer, initializer and constraint for kernel + self.K = num_diffusion_steps self.kernel_initializer = kernel_initializer self.kernel_regularizer = kernel_regularizer self.kernel_constraint = kernel_constraint def build(self, input_shape): - - # Initializing the kernel vector (R^K) - # (theta in paper) - self.kernel = self.add_weight( - shape=(self.K,), - name="kernel", - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - ) + # Initializing the kernel vector (R^K) (theta in paper) + self.kernel = self.add_weight(shape=(self.K,), + name="kernel", + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) def call(self, inputs): - - # Get signal X and adjacency A - X, A = inputs + x, a = inputs # Calculate diffusion matrix: sum kernel_k * Attention_t^k # tf.polyval needs a list of tensors as the coeff. thus we # unstack kernel - diffusion_matrix = tf.math.polyval(tf.unstack(self.kernel), A) + diffusion_matrix = tf.math.polyval(tf.unstack(self.kernel), a) # Apply it to X to get a matrix C = [C_1, ..., C_F] (n_nodes x n_node_features) # of diffused features - diffused_features = tf.matmul(diffusion_matrix, X) + diffused_features = tf.matmul(diffusion_matrix, x) # Now we add all diffused features (columns of the above matrix) # and apply a non linearity to obtain H:,q (eq. 
3 in paper) @@ -91,6 +80,7 @@ class DiffusionConv(GCNConv): Given a number of diffusion steps \(K\) and a row normalized adjacency matrix \(\hat \A \), this layer calculates the q'th channel as: + $$ \mathbf{H}_{~:,~q} = \sigma\left( \sum_{f=1}^{n_node_features} @@ -122,16 +112,14 @@ class DiffusionConv(GCNConv): - `kernel_constraint`: constraint applied to the weights; """ - def __init__( - self, - channels: int, - num_diffusion_steps: int = 6, - kernel_initializer='glorot_uniform', - kernel_regularizer=None, - kernel_constraint=None, - activation='tanh', - ** kwargs - ): + def __init__(self, + channels, + num_diffusion_steps=6, + activation='tanh', + kernel_initializer='glorot_uniform', + kernel_regularizer=None, + kernel_constraint=None, + **kwargs): super().__init__(channels, activation=activation, kernel_initializer=kernel_initializer, @@ -147,30 +135,14 @@ def __init__( self.K = num_diffusion_steps + 1 def build(self, input_shape): - - # We expect to receive (X, A) - # A - Adjacency ([batch], n_nodes, n_nodes) - # X - graph signal ([batch], n_nodes, n_node_features) - X_shape, A_shape = input_shape - - # initialise Q diffusion convolution filters - self.filters = [] - - for _ in range(self.Q): - layer = DiffuseFeatures( - num_diffusion_steps=self.K, - kernel_initializer=self.kernel_initializer, - kernel_regularizer=self.kernel_regularizer, - kernel_constraint=self.kernel_constraint, - ) - self.filters.append(layer) - - def apply_filters(self, X, A): - """Applies diffusion convolution self.Q times to get a - ([batch], n_nodes, Q) diffused graph signal - - """ - + self.filters = [ + DiffuseFeatures(num_diffusion_steps=self.K, + kernel_initializer=self.kernel_initializer, + kernel_regularizer=self.kernel_regularizer, + kernel_constraint=self.kernel_constraint) + for _ in range(self.Q)] + + def apply_filters(self, x, a): # This will be a list of Q diffused features. # Each diffused feature is a (batch, n_nodes, 1) tensor. # Later we will concat all the features to get one @@ -179,25 +151,14 @@ def apply_filters(self, X, A): # Iterating over all Q diffusion filters for diffusion in self.filters: - diffused_feature = diffusion((X, A)) + diffused_feature = diffusion((x, a)) diffused_features.append(diffused_feature) - # Concat them into ([batch], n_nodes, Q) diffused graph signal - H = tf.concat(diffused_features, -1) - - return H + return tf.concat(diffused_features, -1) def call(self, inputs): + x, a = inputs + h = self.apply_filters(x, a) + h = self.activation(h) - # Get graph signal X and adjacency tensor A - X, A = inputs - - # 'single', 'batch' and 'mixed' mode are supported by - # default, since we access the dimensions from the end - # and everything else is broadcasted accordingly - # if its missing. 
- - H = self.apply_filters(X, A) - H = self.activation(H) - - return H + return h diff --git a/spektral/layers/convolutional/ecc_conv.py b/spektral/layers/convolutional/ecc_conv.py index 40400394..4b5925e0 100644 --- a/spektral/layers/convolutional/ecc_conv.py +++ b/spektral/layers/convolutional/ecc_conv.py @@ -124,32 +124,30 @@ def build(self, input_shape): self.built = True def call(self, inputs): - X = inputs[0] # (batch_size, n_nodes, n_node_features) - A = inputs[1] # (batch_size, n_nodes, n_nodes) - E = inputs[2] # (n_edges, n_edge_features) or (batch_size, n_nodes, n_nodes, n_edge_features) + x, a, e = inputs - mode = ops.autodetect_mode(A, X) + mode = ops.autodetect_mode(a, x) if mode == modes.SINGLE: return self._call_single(inputs) # Parameters - N = K.shape(X)[-2] - F = K.int_shape(X)[-1] + N = K.shape(x)[-2] + F = K.int_shape(x)[-1] F_ = self.channels # Filter network - kernel_network = E - for l in self.kernel_network_layers: - kernel_network = l(kernel_network) + kernel_network = e + for layer in self.kernel_network_layers: + kernel_network = layer(kernel_network) # Convolution target_shape = (-1, N, N, F_, F) if mode == modes.BATCH else (N, N, F_, F) kernel = K.reshape(kernel_network, target_shape) - output = kernel * A[..., None, None] - output = tf.einsum('abicf,aif->abc', output, X) + output = kernel * a[..., None, None] + output = tf.einsum('abicf,aif->abc', output, x) if self.root: - output += ops.dot(X, self.root_kernel) + output += ops.dot(x, self.root_kernel) if self.use_bias: output = K.bias_add(output, self.bias) if self.activation is not None: @@ -158,38 +156,38 @@ def call(self, inputs): return output def _call_single(self, inputs): - X = inputs[0] # (n_nodes, F) - A = inputs[1] # (n_nodes, n_nodes) - E = inputs[2] # (n_edges, n_edge_features) - assert K.ndim(E) == 2, 'In single mode, E must have shape (n_edges, n_edge_features).' 
+ x, a, e = inputs + if K.ndim(e) != 2: + raise ValueError('In single mode, E must have shape ' + '(n_edges, n_edge_features).') # Enforce sparse representation - if not K.is_sparse(A): - A = ops.dense_to_sparse(A) + if not K.is_sparse(a): + a = ops.dense_to_sparse(a) # Parameters - N = tf.shape(X)[-2] - F = K.int_shape(X)[-1] + N = tf.shape(x)[-2] + F = K.int_shape(x)[-1] F_ = self.channels # Filter network - kernel_network = E - for l in self.kernel_network_layers: - kernel_network = l(kernel_network) # (n_edges, F * F_) + kernel_network = e + for layer in self.kernel_network_layers: + kernel_network = layer(kernel_network) # (n_edges, F * F_) target_shape = (-1, F, F_) kernel = tf.reshape(kernel_network, target_shape) # Propagation - index_i = A.indices[:, -2] - index_j = A.indices[:, -1] - messages = tf.gather(X, index_j) + index_i = a.indices[:, -2] + index_j = a.indices[:, -1] + messages = tf.gather(x, index_j) messages = ops.dot(messages[:, None, :], kernel)[:, 0, :] aggregated = ops.scatter_sum(messages, index_i, N) # Update output = aggregated if self.root: - output += ops.dot(X, self.root_kernel) + output += ops.dot(x, self.root_kernel) if self.use_bias: output = K.bias_add(output, self.bias) if self.activation is not None: @@ -206,5 +204,5 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) @staticmethod - def preprocess(A): - return A + def preprocess(a): + return a diff --git a/spektral/layers/convolutional/edge_conv.py b/spektral/layers/convolutional/edge_conv.py index 6af51eaa..bc7d90e3 100644 --- a/spektral/layers/convolutional/edge_conv.py +++ b/spektral/layers/convolutional/edge_conv.py @@ -94,10 +94,10 @@ def build(self, input_shape): self.built = True - def message(self, X, **kwargs): - X_i = self.get_i(X) - X_j = self.get_j(X) - return self.mlp(K.concatenate((X_i, X_j - X_i))) + def message(self, x, **kwargs): + x_i = self.get_i(x) + x_j = self.get_j(x) + return self.mlp(K.concatenate((x_i, x_j - x_i))) def get_config(self): config = { diff --git a/spektral/layers/convolutional/gat_conv.py b/spektral/layers/convolutional/gat_conv.py index 645affeb..02895267 100644 --- a/spektral/layers/convolutional/gat_conv.py +++ b/spektral/layers/convolutional/gat_conv.py @@ -166,14 +166,13 @@ def build(self, input_shape): self.built = True def call(self, inputs): - X = inputs[0] - A = inputs[1] + x, a = inputs - mode = ops.autodetect_mode(A, X) - if mode == modes.SINGLE and K.is_sparse(A): - output, attn_coef = self._call_single(X, A) + mode = ops.autodetect_mode(a, x) + if mode == modes.SINGLE and K.is_sparse(a): + output, attn_coef = self._call_single(x, a) else: - output, attn_coef = self._call_dense(X, A) + output, attn_coef = self._call_dense(x, a) if self.concat_heads: shape = output.shape[:-2] + [self.attn_heads * self.channels] @@ -192,26 +191,26 @@ def call(self, inputs): else: return output - def _call_single(self, X, A): + def _call_single(self, x, a): # Reshape kernels for efficient message-passing kernel = tf.reshape(self.kernel, (-1, self.attn_heads * self.channels)) attn_kernel_self = ops.transpose(self.attn_kernel_self, (2, 1, 0)) attn_kernel_neighs = ops.transpose(self.attn_kernel_neighs, (2, 1, 0)) # Prepare message-passing - indices = A.indices - N = tf.shape(X, out_type=indices.dtype)[0] - indices = ops.sparse_add_self_loops(indices, N) + indices = a.indices + N = tf.shape(x, out_type=indices.dtype)[0] + indices = ops.add_self_loops_indices(indices, N) targets, sources = indices[:, -2], indices[:, -1] # Update node features - X 
= ops.dot(X, kernel) - X = tf.reshape(X, (-1, self.attn_heads, self.channels)) + x = ops.dot(x, kernel) + x = tf.reshape(x, (-1, self.attn_heads, self.channels)) # Compute attention - attn_for_self = tf.reduce_sum(X * attn_kernel_self, -1) + attn_for_self = tf.reduce_sum(x * attn_kernel_self, -1) attn_for_self = tf.gather(attn_for_self, targets) - attn_for_neighs = tf.reduce_sum(X * attn_kernel_neighs, -1) + attn_for_neighs = tf.reduce_sum(x * attn_kernel_neighs, -1) attn_for_neighs = tf.gather(attn_for_neighs, sources) attn_coef = attn_for_self + attn_for_neighs @@ -221,29 +220,29 @@ def _call_single(self, X, A): attn_coef = attn_coef[..., None] # Update representation - output = attn_coef * tf.gather(X, sources) + output = attn_coef * tf.gather(x, sources) output = ops.scatter_sum(output, targets, N) return output, attn_coef - def _call_dense(self, X, A): - shape = tf.shape(A)[:-1] - A = tf.linalg.set_diag(A, tf.zeros(shape, A.dtype)) - A = tf.linalg.set_diag(A, tf.ones(shape, A.dtype)) - X = tf.einsum("...NI , IHO -> ...NHO", X, self.kernel) - attn_for_self = tf.einsum("...NHI , IHO -> ...NHO", X, self.attn_kernel_self) - attn_for_neighs = tf.einsum("...NHI , IHO -> ...NHO", X, self.attn_kernel_neighs) + def _call_dense(self, x, a): + shape = tf.shape(a)[:-1] + a = tf.linalg.set_diag(a, tf.zeros(shape, a.dtype)) + a = tf.linalg.set_diag(a, tf.ones(shape, a.dtype)) + x = tf.einsum("...NI , IHO -> ...NHO", x, self.kernel) + attn_for_self = tf.einsum("...NHI , IHO -> ...NHO", x, self.attn_kernel_self) + attn_for_neighs = tf.einsum("...NHI , IHO -> ...NHO", x, self.attn_kernel_neighs) attn_for_neighs = tf.einsum("...ABC -> ...CBA", attn_for_neighs) attn_coef = attn_for_self + attn_for_neighs attn_coef = tf.nn.leaky_relu(attn_coef, alpha=0.2) - mask = -10e9 * (1.0 - A) + mask = -10e9 * (1.0 - a) attn_coef += mask[..., None, :] attn_coef = tf.nn.softmax(attn_coef, axis=-1) attn_coef_drop = self.dropout(attn_coef) - output = tf.einsum("...NHM , ...MHI -> ...NHI", attn_coef_drop, X) + output = tf.einsum("...NHM , ...MHI -> ...NHI", attn_coef_drop, x) return output, attn_coef @@ -265,5 +264,5 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) @staticmethod - def preprocess(A): - return A + def preprocess(a): + return a diff --git a/spektral/layers/convolutional/gcn_conv.py b/spektral/layers/convolutional/gcn_conv.py index 23cc4dfa..4be487d5 100644 --- a/spektral/layers/convolutional/gcn_conv.py +++ b/spektral/layers/convolutional/gcn_conv.py @@ -89,12 +89,9 @@ def build(self, input_shape): self.built = True def call(self, inputs): - features = inputs[0] - fltr = inputs[1] - - # Convolution - output = ops.dot(features, self.kernel) - output = ops.filter_dot(fltr, output) + x, a = inputs + output = ops.dot(x, self.kernel) + output = ops.filter_dot(a, output) if self.use_bias: output = K.bias_add(output, self.bias) @@ -123,5 +120,5 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) @staticmethod - def preprocess(A): - return gcn_filter(A) \ No newline at end of file + def preprocess(a): + return gcn_filter(a) diff --git a/spektral/layers/convolutional/gcs_conv.py b/spektral/layers/convolutional/gcs_conv.py index 1a2a1326..a8c83a49 100644 --- a/spektral/layers/convolutional/gcs_conv.py +++ b/spektral/layers/convolutional/gcs_conv.py @@ -92,15 +92,11 @@ def build(self, input_shape): self.built = True def call(self, inputs): - features = inputs[0] - fltr = inputs[1] + x, a = inputs - # Convolution - output = K.dot(features, 
self.kernel_1) - output = ops.filter_dot(fltr, output) - - # Skip connection - skip = K.dot(features, self.kernel_2) + output = K.dot(x, self.kernel_1) + output = ops.filter_dot(a, output) + skip = K.dot(x, self.kernel_2) output += skip if self.use_bias: @@ -110,5 +106,5 @@ def call(self, inputs): return output @staticmethod - def preprocess(A): - return normalized_adjacency(A) + def preprocess(a): + return normalized_adjacency(a) diff --git a/spektral/layers/convolutional/general_conv.py b/spektral/layers/convolutional/general_conv.py new file mode 100644 index 00000000..051a8287 --- /dev/null +++ b/spektral/layers/convolutional/general_conv.py @@ -0,0 +1,63 @@ +from spektral.layers import MessagePassing + + +class GeneralConv(MessagePassing): + def __init__(self, + channels, + batch_norm=True, + dropout=0.0, + aggregate='sum', + activation=None, + use_bias=True, + kernel_initializer='glorot_uniform', + bias_initializer='zeros', + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs): + super().__init__(aggregate=aggregate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + **kwargs) + self.channels = self.output_dim = channels + + def build(self, input_shape): + input_dim = input_shape[0][-1] + self.kernel = self.add_weight(shape=(input_dim, self.channels), + initializer=self.kernel_initializer, + name='kernel', + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + if self.use_bias: + self.bias = self.add_weight(shape=(self.channels,), + initializer=self.bias_initializer, + name='bias', + regularizer=self.bias_regularizer, + constraint=self.bias_constraint) + self.built = True + + def call(self, inputs, **kwargs): + x, a, _ = self.get_inputs(inputs) + + + + def message(self, x, **kwargs): + pass + + + def get_config(self): + config = { + 'channels': self.channels, + } + base_config = super().get_config() + base_config.pop('aggregate') # Remove it because it's defined by constructor + return {**base_config, **config} \ No newline at end of file diff --git a/spektral/layers/convolutional/gin_conv.py b/spektral/layers/convolutional/gin_conv.py index fb27a769..546d110e 100644 --- a/spektral/layers/convolutional/gin_conv.py +++ b/spektral/layers/convolutional/gin_conv.py @@ -108,7 +108,7 @@ def build(self, input_shape): self.built = True - def call(self, inputs): + def call(self, inputs, **kwargs): x, a, _ = self.get_inputs(inputs) output = self.mlp((1.0 + self.eps) * x + self.propagate(x, a)) diff --git a/spektral/layers/convolutional/graphsage_conv.py b/spektral/layers/convolutional/graphsage_conv.py index 01837315..5ddc360e 100644 --- a/spektral/layers/convolutional/graphsage_conv.py +++ b/spektral/layers/convolutional/graphsage_conv.py @@ -1,11 +1,10 @@ -import tensorflow as tf from tensorflow.keras import backend as K from spektral.layers import ops -from spektral.layers.convolutional.gcn_conv import GCNConv +from spektral.layers.convolutional.message_passing import MessagePassing -class GraphSageConv(GCNConv): +class GraphSageConv(MessagePassing): r""" A GraphSAGE layer as presented by [Hamilton et al. (2017)](https://arxiv.org/abs/1706.02216). 
@@ -50,7 +49,7 @@ class GraphSageConv(GCNConv): def __init__(self, channels, - aggregate_op='mean', + aggregate='mean', activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -61,7 +60,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(channels, + super().__init__(aggregate=aggregate, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, @@ -72,21 +71,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - if aggregate_op == 'sum': - self.aggregate_op = ops.scatter_sum - elif aggregate_op == 'mean': - self.aggregate_op = ops.scatter_mean - elif aggregate_op == 'max': - self.aggregate_op = ops.scatter_max - elif aggregate_op == 'min': - self.aggregate_op = ops.scatter_sum - elif aggregate_op == 'prod': - self.aggregate_op = ops.scatter_prod - elif callable(aggregate_op): - self.aggregate_op = aggregate_op - else: - raise ValueError('Possbile aggragation methods: sum, mean, max, min, ' - 'prod') + self.channels = self.output_dim = channels def build(self, input_shape): assert len(input_shape) >= 2 @@ -107,21 +92,11 @@ def build(self, input_shape): self.built = True def call(self, inputs): - features = inputs[0] - fltr = inputs[1] - - # Enforce sparse representation - if not K.is_sparse(fltr): - fltr = ops.dense_to_sparse(fltr) - - # Propagation - indices = fltr.indices - N = tf.shape(features, out_type=indices.dtype)[0] - indices = ops.sparse_add_self_loops(indices, N) - targets, sources = indices[:, -2], indices[:, -1] - messages = tf.gather(features, sources) - aggregated = self.aggregate_op(messages, targets, N) - output = K.concatenate([features, aggregated]) + x, a, _ = self.get_inputs(inputs) + a = ops.add_self_loops(a) + + aggregated = self.propagate(x, a) + output = K.concatenate([x, aggregated]) output = ops.dot(output, self.kernel) if self.use_bias: @@ -129,15 +104,16 @@ def call(self, inputs): output = K.l2_normalize(output, axis=-1) if self.activation is not None: output = self.activation(output) + return output def get_config(self): config = { - 'aggregate_op': self.aggregate_op + 'channels': self.channels } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) @staticmethod - def preprocess(A): - return A + def preprocess(a): + return a diff --git a/spektral/layers/convolutional/message_passing.py b/spektral/layers/convolutional/message_passing.py index 1ce7124b..ef2b136c 100644 --- a/spektral/layers/convolutional/message_passing.py +++ b/spektral/layers/convolutional/message_passing.py @@ -33,7 +33,7 @@ class MessagePassing(Layer): **API:** - - `propagate(X, A, E=None, **kwargs)`: propagate the messages and computes + - `propagate(X, A, E=None, **kwargs)`: propagate the messages and compute embeddings for each node in the graph. `kwargs` will be propagated as keyword arguments to `message()`, `aggregate()` and `update()`. 
- `message(X, **kwargs)`: computes messages, equivalent to \(\phi\) in the @@ -80,33 +80,33 @@ def __init__(self, aggregate='sum', **kwargs): self.agg = deserialize_scatter(aggregate) def call(self, inputs, **kwargs): - X, A, E = self.get_inputs(inputs) - return self.propagate(X, A, E) + x, a, e = self.get_inputs(inputs) + return self.propagate(x, a, e) def build(self, input_shape): self.built = True - def propagate(self, X, A, E=None, **kwargs): - self.N = tf.shape(X)[0] - self.index_i = A.indices[:, 0] - self.index_j = A.indices[:, 1] + def propagate(self, x, a, e=None, **kwargs): + self.N = tf.shape(x)[0] + self.index_i = a.indices[:, 1] + self.index_j = a.indices[:, 0] # Message - msg_kwargs = self.get_kwargs(X, A, E, self.msg_signature, kwargs) - messages = self.message(X, **msg_kwargs) + msg_kwargs = self.get_kwargs(x, a, e, self.msg_signature, kwargs) + messages = self.message(x, **msg_kwargs) # Aggregate - agg_kwargs = self.get_kwargs(X, A, E, self.agg_signature, kwargs) + agg_kwargs = self.get_kwargs(x, a, e, self.agg_signature, kwargs) embeddings = self.aggregate(messages, **agg_kwargs) # Update - upd_kwargs = self.get_kwargs(X, A, E, self.upd_signature, kwargs) + upd_kwargs = self.get_kwargs(x, a, e, self.upd_signature, kwargs) output = self.update(embeddings, **upd_kwargs) return output - def message(self, X, **kwargs): - return self.get_j(X) + def message(self, x, **kwargs): + return self.get_j(x) def aggregate(self, messages, **kwargs): return self.agg(messages, self.index_i, self.N) @@ -120,17 +120,17 @@ def get_i(self, x): def get_j(self, x): return tf.gather(x, self.index_j) - def get_kwargs(self, X, A, E, signature, kwargs): + def get_kwargs(self, x, a, e, signature, kwargs): output = {} for k in signature.keys(): if signature[k].default is inspect.Parameter.empty or k == 'kwargs': pass - elif k == 'X': - output[k] = X - elif k == 'A': - output[k] = A - elif k == 'E': - output[k] = E + elif k == 'x': + output[k] = x + elif k == 'a': + output[k] = a + elif k == 'e': + output[k] = e elif k in kwargs: output[k] = kwargs[k] else: @@ -142,19 +142,19 @@ def get_kwargs(self, X, A, E, signature, kwargs): @staticmethod def get_inputs(inputs): if len(inputs) == 3: - X, A, E = inputs - assert K.ndim(E) == 2, 'E must have rank 2' + x, a, e = inputs + assert K.ndim(e) == 2, 'E must have rank 2' elif len(inputs) == 2: - X, A = inputs - E = None + x, a = inputs + e = None else: raise ValueError('Expected 2 or 3 inputs tensors (X, A, E), got {}.' 
.format(len(inputs))) - assert K.ndim(X) == 2, 'X must have rank 2' - assert K.is_sparse(A), 'A must be a SparseTensor' - assert K.ndim(A) == 2, 'A must have rank 2' + assert K.ndim(x) == 2, 'X must have rank 2' + assert K.is_sparse(a), 'A must be a SparseTensor' + assert K.ndim(a) == 2, 'A must have rank 2' - return X, A, E + return x, a, e def compute_output_shape(self, input_shape): if self.output_dim: @@ -173,5 +173,5 @@ def get_config(self): return {**base_config, **config} @staticmethod - def preprocess(A): - return A + def preprocess(a): + return a diff --git a/spektral/layers/convolutional/tag_conv.py b/spektral/layers/convolutional/tag_conv.py index c935f3a9..1fc2181c 100644 --- a/spektral/layers/convolutional/tag_conv.py +++ b/spektral/layers/convolutional/tag_conv.py @@ -97,9 +97,9 @@ def call(self, inputs, **kwargs): return self.linear(output) - def message(self, X, edge_weight=None): - X_j = self.get_j(X) - return edge_weight[:, None] * X_j + def message(self, x, edge_weight=None): + x_j = self.get_j(x) + return edge_weight[:, None] * x_j def get_config(self): config = { @@ -110,5 +110,5 @@ def get_config(self): return {**base_config, **config} @staticmethod - def preprocess(A): - return normalized_adjacency(A) + def preprocess(a): + return normalized_adjacency(a) diff --git a/spektral/layers/ops/sparse.py b/spektral/layers/ops/sparse.py index a5e24c30..3e0b6690 100644 --- a/spektral/layers/ops/sparse.py +++ b/spektral/layers/ops/sparse.py @@ -55,7 +55,43 @@ def dense_to_sparse(x): return tf.SparseTensor(indices, values, shape) -def sparse_add_self_loops(indices, N=None): +def add_self_loops(a, fill=1.): + """ + Adds self-loops to the given adjacency matrix. Self-loops are added only for + those node that don't have a self-loop already, and are assigned a weight + of `fill`. + :param a: a square SparseTensor. + :param fill: the fill value for the new self-loops. It will be cast to the + dtype of `a`. + :return: a SparseTensor with the same shape as the input. + """ + N = tf.shape(a)[0] + indices = a.indices + values = a.values + + mask_od = indices[:, 0] != indices[:, 1] + mask_sl = ~mask_od + + indices_od = indices[mask_od] + indices_sl = indices[mask_sl] + + values_sl = tf.fill((N, ), tf.cast(fill, values.dtype)) + values_sl = tf.tensor_scatter_nd_update( + values_sl, indices_sl[:, 0:1], values[mask_sl]) + + indices_sl = tf.range(N, dtype=indices.dtype)[:, None] + indices_sl = tf.repeat(indices_sl, 2, -1) + indices = tf.concat((indices_od, indices_sl), 0) + + values_od = values[mask_od] + values = tf.concat((values_od, values_sl), 0) + + out = tf.SparseTensor(indices, values, (N, N)) + + return tf.sparse.reorder(out) + + +def add_self_loops_indices(indices, N=None): """ Given the indices of a square SparseTensor, adds the diagonal entries (i, i) and returns the reordered indices. 
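Note: the new `add_self_loops` op above works directly on a square SparseTensor, and the GraphSageConv change in this patch calls it as `ops.add_self_loops(a)`. A minimal sketch of the intended behaviour (hypothetical usage; the expected output follows the implementation and docstring above):

```python
import tensorflow as tf
from spektral.layers import ops

# 3-node adjacency with one existing self-loop (node 0) and two off-diagonal edges
a = tf.SparseTensor(
    indices=[[0, 0], [0, 1], [2, 1]],
    values=[2.0, 1.0, 1.0],
    dense_shape=(3, 3),
)

# Missing diagonal entries (1, 1) and (2, 2) are filled with `fill`;
# the existing (0, 0) entry keeps its original value of 2.0
a_sl = ops.add_self_loops(a, fill=1.0)
print(tf.sparse.to_dense(a_sl).numpy())
# Expected, based on the implementation above:
# [[2. 1. 0.]
#  [0. 1. 0.]
#  [0. 1. 1.]]
```
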
diff --git a/tests/test_layers/test_convolutional.py b/tests/test_layers/test_convolutional.py index 604e501d..9eb51e55 100644 --- a/tests/test_layers/test_convolutional.py +++ b/tests/test_layers/test_convolutional.py @@ -67,7 +67,7 @@ { LAYER_K_: GraphSageConv, MODES_K_: [SINGLE], - KWARGS_K_: {'channels': 8, 'activation': 'relu', 'sparse': [False, True]} + KWARGS_K_: {'channels': 8, 'activation': 'relu', 'sparse': [True]} }, { LAYER_K_: ECCConv, From d8c00dec5e9fbe018d6fa9f4209ef5601b5066d2 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Wed, 25 Nov 2020 18:47:00 +0100 Subject: [PATCH 43/57] Add models module Add GeneralGNN model Improve docs --- docs/autogen.py | 36 +-- docs/mkdocs.yml | 1 + docs/templates/models.md | 5 + spektral/layers/convolutional/__init__.py | 7 +- spektral/layers/convolutional/general_conv.py | 99 +++++++- .../layers/convolutional/graphsage_conv.py | 2 + .../layers/convolutional/message_passing.py | 2 +- spektral/layers/pooling/global_pool.py | 18 ++ spektral/models/__init__.py | 1 + spektral/models/general_gnn.py | 218 ++++++++++++++++++ tests/test_layers/test_convolutional.py | 41 ++-- 11 files changed, 387 insertions(+), 43 deletions(-) create mode 100644 docs/templates/models.md create mode 100644 spektral/models/__init__.py create mode 100644 spektral/models/general_gnn.py diff --git a/docs/autogen.py b/docs/autogen.py index 390bc9ad..c3d9c2b8 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -11,6 +11,7 @@ from spektral import data from spektral import datasets from spektral import layers +from spektral import models from spektral import transforms from spektral import utils @@ -36,22 +37,23 @@ { 'page': 'layers/convolution.md', 'classes': [ - layers.GCNConv, - layers.ChebConv, - layers.GraphSageConv, + layers.MessagePassing, + layers.AGNNConv, + layers.APPNPConv, layers.ARMAConv, + layers.ChebConv, + layers.CrystalConv, + layers.DiffusionConv, layers.ECCConv, + layers.EdgeConv, layers.GATConv, + layers.GatedGraphConv, + layers.GCNConv, + layers.GeneralConv, layers.GCSConv, - layers.APPNPConv, layers.GINConv, - layers.DiffusionConv, - layers.GatedGraphConv, - layers.AGNNConv, + layers.GraphSageConv, layers.TAGConv, - layers.CrystalConv, - layers.EdgeConv, - layers.MessagePassing, ] }, { @@ -61,11 +63,11 @@ 'classes': [ layers.DiffPool, layers.MinCutPool, - layers.TopKPool, layers.SAGPool, - layers.GlobalSumPool, + layers.TopKPool, layers.GlobalAvgPool, layers.GlobalMaxPool, + layers.GlobalSumPool, layers.GlobalAttentionPool, layers.GlobalAttnSumPool, layers.SortPool @@ -77,8 +79,16 @@ 'methods': [], 'classes': [ layers.InnerProduct, - layers.MinkowskiProduct, layers.Disjoint2Batch, + layers.MinkowskiProduct, + ] + }, + { + 'page': 'models.md', + 'functions': [], + 'methods': [], + 'classes': [ + models.GeneralGNN ] }, # Data ##################################################################### diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 2f1de14d..729a6afb 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -38,6 +38,7 @@ nav: - Convolutional Layers: layers/convolution.md - Pooling Layers: layers/pooling.md - Base Layers: layers/base.md + - Models: models.md - Data: - Containers: data.md - Datasets: datasets.md diff --git a/docs/templates/models.md b/docs/templates/models.md new file mode 100644 index 00000000..7eaa7809 --- /dev/null +++ b/docs/templates/models.md @@ -0,0 +1,5 @@ +# Models + +This module implements ready-to-use models from recent literature. 
+ +{{autogenerated}} \ No newline at end of file diff --git a/spektral/layers/convolutional/__init__.py b/spektral/layers/convolutional/__init__.py index 861d978a..16b061bb 100644 --- a/spektral/layers/convolutional/__init__.py +++ b/spektral/layers/convolutional/__init__.py @@ -6,11 +6,12 @@ from .diffusion_conv import DiffusionConv from .ecc_conv import ECCConv from .edge_conv import EdgeConv -from .gated_graph_conv import GatedGraphConv -from .gin_conv import GINConv from .gat_conv import GATConv +from .gated_graph_conv import GatedGraphConv from .gcn_conv import GCNConv from .gcs_conv import GCSConv +from .general_conv import GeneralConv +from .gin_conv import GINConv from .graphsage_conv import GraphSageConv +from .message_passing import MessagePassing from .tag_conv import TAGConv -from .message_passing import MessagePassing \ No newline at end of file diff --git a/spektral/layers/convolutional/general_conv.py b/spektral/layers/convolutional/general_conv.py index 051a8287..af5e5bf3 100644 --- a/spektral/layers/convolutional/general_conv.py +++ b/spektral/layers/convolutional/general_conv.py @@ -1,13 +1,82 @@ -from spektral.layers import MessagePassing +from spektral.layers.convolutional.message_passing import MessagePassing +from tensorflow.keras.layers import Dropout, BatchNormalization, PReLU +from spektral.layers.ops import dot +import tensorflow as tf +from tensorflow.keras import activations class GeneralConv(MessagePassing): + r""" + A general convolutional layer as described by + [You et al.](https://arxiv.org/abs/2011.08843). + + **Mode**: single, disjoint. + + **This layer expects a sparse adjacency matrix.** + + This layer computes: + + $$ + \h_i = \mathrm{Agg} \left( \left\{ \mathrm{Act} \left( \mathrm{Dropout} + \left( \mathrm{BN} \left( \x_j \W + \b \right) \right) \right), + j \in \mathcal{N}(i) \right\} \right) + $$ + + where \( \mathrm{Agg} \) is an aggregation function for the messages, + \( \mathrm{Act} \) is an activation function, \( \mathrm{Dropout} \) + applies dropout to the node features, and \( \mathrm{BN} \) applies batch + normalization to the node features. + + This layer supports the PReLU activation via the 'prelu' keyword. + + The default parameters of this layer are selected according to the best + results obtained in the paper, and should provide a good performance on + many node-level and graph-level tasks, without modifications. + The defaults are as follows: + + - 256 channels + - Batch normalization + - No dropout + - PReLU activation + - Sum aggregation + + If you are uncertain about which layers to use for your GNN, this is a + safe choice. Check out the original paper for more specific configurations. + + **Input** + + - Node features of shape `(n_nodes, n_node_features)`; + - Binary adjacency matrix of shape `(n_nodes, n_nodes)`. + + **Output** + + - Node features with the same shape of the input, but the last dimension + changed to `channels`. + + **Arguments** + + - `channels`: integer, number of output channels; + - `batch_norm`: bool, whether to use batch normalization; + - `dropout`: float, dropout rate; + - `aggregate`: string or callable, an aggregation function. Supported + aggregations: 'sum', 'mean', 'max', 'min', 'prod'. + - `activation`: activation function to use. This layer also supports the + advanced activation PReLU by passing `activation='prelu'`. 
+ - `use_bias`: bool, add a bias vector to the output; + - `kernel_initializer`: initializer for the weights; + - `bias_initializer`: initializer for the bias vector; + - `kernel_regularizer`: regularization applied to the weights; + - `bias_regularizer`: regularization applied to the bias vector; + - `activity_regularizer`: regularization applied to the output; + - `kernel_constraint`: constraint applied to the weights; + - `bias_constraint`: constraint applied to the bias vector. + """ def __init__(self, - channels, + channels=256, batch_norm=True, dropout=0.0, aggregate='sum', - activation=None, + activation='prelu', use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', @@ -18,7 +87,7 @@ def __init__(self, bias_constraint=None, **kwargs): super().__init__(aggregate=aggregate, - activation=activation, + activation=None, use_bias=use_bias, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, @@ -29,9 +98,18 @@ def __init__(self, bias_constraint=bias_constraint, **kwargs) self.channels = self.output_dim = channels + self.dropout_rate = dropout + self.use_batch_norm = batch_norm + if activation == 'prelu': + self.activation = PReLU() + else: + self.activation = activations.get(activation) def build(self, input_shape): input_dim = input_shape[0][-1] + self.dropout = Dropout(self.dropout_rate) + if self.use_batch_norm: + self.batch_norm = BatchNormalization() self.kernel = self.add_weight(shape=(input_dim, self.channels), initializer=self.kernel_initializer, name='kernel', @@ -48,11 +126,17 @@ def build(self, input_shape): def call(self, inputs, **kwargs): x, a, _ = self.get_inputs(inputs) + # TODO: a = add_self_loops(a) + x = dot(x, self.kernel) + if self.use_bias: + x = tf.nn.bias_add(x, self.bias) + if self.use_batch_norm: + x = self.batch_norm(x) + x = self.dropout(x) + x = self.activation(x) - def message(self, x, **kwargs): - pass - + return self.propagate(x, a) def get_config(self): config = { @@ -60,4 +144,5 @@ def get_config(self): } base_config = super().get_config() base_config.pop('aggregate') # Remove it because it's defined by constructor + base_config['activation'] = 'prelu' return {**base_config, **config} \ No newline at end of file diff --git a/spektral/layers/convolutional/graphsage_conv.py b/spektral/layers/convolutional/graphsage_conv.py index 5ddc360e..0469bf7f 100644 --- a/spektral/layers/convolutional/graphsage_conv.py +++ b/spektral/layers/convolutional/graphsage_conv.py @@ -11,6 +11,8 @@ class GraphSageConv(MessagePassing): **Mode**: single, disjoint. + **This layer expects a sparse adjacency matrix.** + This layer computes: $$ \Z = \big[ \textrm{AGGREGATE}(\X) \| \X \big] \W + \b; \\ diff --git a/spektral/layers/convolutional/message_passing.py b/spektral/layers/convolutional/message_passing.py index ef2b136c..5605d7c5 100644 --- a/spektral/layers/convolutional/message_passing.py +++ b/spektral/layers/convolutional/message_passing.py @@ -57,7 +57,7 @@ class MessagePassing(Layer): **Arguments**: - - `aggregate`: string or callable, an aggregate function. This flag can be + - `aggregate`: string or callable, an aggregation function. This flag can be used to control the behaviour of `aggregate()` wihtout re-implementing it. Supported aggregations: 'sum', 'mean', 'max', 'min', 'prod'. 
If callable, the function must have the signature `foo(updates, indices, n_nodes)` diff --git a/spektral/layers/pooling/global_pool.py b/spektral/layers/pooling/global_pool.py index a972323c..e341908e 100644 --- a/spektral/layers/pooling/global_pool.py +++ b/spektral/layers/pooling/global_pool.py @@ -431,3 +431,21 @@ def compute_output_shape(self, input_shape): return self.k, self.F elif self.data_mode == 'batch' or self.data_mode == 'disjoint': return input_shape[0], self.k, self.F + + +layers = { + 'sum': GlobalSumPool, + 'avg': GlobalAvgPool, + 'max': GlobalMaxPool, + 'attn': GlobalAttentionPool, + 'attn_sum': GlobalAttnSumPool, + 'sort': SortPool +} + + +def get(identifier): + if identifier not in layers: + raise ValueError('Unknown identifier {}. Available: {}' + .format(identifier, list(layers.keys()))) + else: + return layers[identifier] \ No newline at end of file diff --git a/spektral/models/__init__.py b/spektral/models/__init__.py new file mode 100644 index 00000000..3796fb3f --- /dev/null +++ b/spektral/models/__init__.py @@ -0,0 +1 @@ +from .general_gnn import GeneralGNN \ No newline at end of file diff --git a/spektral/models/general_gnn.py b/spektral/models/general_gnn.py new file mode 100644 index 00000000..256dcde4 --- /dev/null +++ b/spektral/models/general_gnn.py @@ -0,0 +1,218 @@ +from tensorflow.keras import Model, Sequential +from tensorflow.keras.layers import Dense, Concatenate, Add, BatchNormalization, Dropout, Activation +from tensorflow.keras.layers import PReLU + +from spektral.data import DisjointLoader +from spektral.datasets import TUDataset +from spektral.layers import GeneralConv +from spektral.layers.pooling import global_pool + + +def get_act(identifier): + if identifier == 'prelu': + return PReLU() + else: + return Activation(identifier) + + +class GeneralGNN(Model): + r""" + This model implements the GNN architecture described in + + > [Design Space for Graph Neural Networks](https://arxiv.org/abs/2011.08843)
+ > Jiaxuan You, Rex Ying, Jure Leskovec
+ > NeurIPS 2020 + + The default parameters of the model are selected according to the best + results obtained in the paper, and should provide a good performance on + many node-level and graph-level tasks, without modifications. + The defaults are as follows: + + - 256 hidden channels + - 4 message passing layers + - 2 pre-processing layers + - 2 post-processing layers + - Skip connections with concatenation + - Batch normalization + - No dropout + - PReLU activations + - Sum aggregation in the message-passing layers + - Global sum pooling (not from the paper) + + The GNN uses the [`GeneralConv` layer](/layers/convolution/#generalconv) + for message passing, and has a pre- and a post-processing MLP for the node + features. + Message-passing layers also have optional skip connections, which can be + implemented as sum or concatenation. + + The dense layers of the pre-processing and post-processing MLPs compute the + following update of the node features: + + $$ + \h_i = \mathrm{Act} \left( \mathrm{Dropout} \left( \mathrm{BN} + \left( \x_i \W + \b \right) \right) \right) + $$ + + Message-passing layers compute: + + $$ + \h_i = \mathrm{Agg} \left( \left\{ \mathrm{Act} \left( \mathrm{Dropout} + \left( \mathrm{BN} \left( \x_j \W + \b \right) \right) \right), + j \in \mathcal{N}(i) \right\} \right) + $$ + + **Arguments** + + - `output`: int, the number of output units; + - `activation`: the activation function of the output layer. + - `hidden`: int, the number of hidden units for all layers except the output + one; + - `message_passing`: int, the nummber of message-passing layers; + - `pre_process`: int, the number of layers in the pre-processing MLP; + - `post_process`: int, the number of layers in the post-processing MLP; + - `connectivity`: the type of skip connection. Can be: None, 'sum' or 'cat'; + - `batch_norm`: bool, whether to use batch normalization; + - `dropout`: float, dropout rate; + - `aggregate`: string or callable, an aggregation function. Supported + aggregations: 'sum', 'mean', 'max', 'min', 'prod'. + - `hidden_activation`: activation function in the hidden layers. The PReLU + activation can be used by passing `hidden_activation='prelu'`. + - `pool`: string or None, the global pooling function. If None, no global + pooling is applied (e.g., for node-level learning). Supported pooling methods: + 'sum', 'avg', 'max', 'attn', 'attn_sum', 'sort' + (see `spektral.layers.pooling.global_pool`). + """ + def __init__(self, output, activation=None, hidden=256, message_passing=4, + pre_process=2, post_process=2, connectivity='cat', + batch_norm=True, dropout=0.0, aggregate='sum', + hidden_activation='prelu', pool='sum'): + super().__init__() + + # Connectivity function + if connectivity is None: + self.connectivity = None + elif connectivity == 'sum': + self.connectivity = Add() + elif connectivity == 'cat': + self.connectivity = Concatenate() + else: + raise ValueError('Unknown connectivity: {}. 
Available: None, sum, cat.') + + # Global pooling + if pool is not None: + self.pool = global_pool.get(pool)() + else: + self.pool = None + + # Neural blocks + self.pre = MLP(hidden, hidden, pre_process, batch_norm, dropout, + hidden_activation, hidden_activation) + self.gnn = [GeneralConv(hidden, batch_norm, dropout, aggregate, hidden_activation) + for _ in range(message_passing)] + self.post = MLP(output, hidden, post_process, batch_norm, dropout, hidden_activation, activation) + + def call(self, inputs): + if len(inputs) == 2: + x, a = inputs + i = None + else: + x, a, i = inputs + + # Pre-process + out = self.pre(x) + # Message passing + for layer in self.gnn: + z = layer([out, a]) + if self.connectivity is not None: + out = self.connectivity([z, out]) + else: + out = z + # Global pooling + if self.pool is not None: + out = self.pool([out] + [i] if i is not None else []) + # Post-process + out = self.post(out) + + return out + + +class MLP(Model): + def __init__(self, output, hidden=256, layers=2, batch_norm=True, + dropout=0.0, activation='prelu', final_activation=None): + super().__init__() + self.batch_norm = batch_norm + self.dropout_rate = dropout + + self.mlp = Sequential() + for i in range(layers): + # Linear + self.mlp.add(Dense(hidden if i < layers - 1 else output)) + # Batch norm + if self.batch_norm: + self.mlp.add(BatchNormalization()) + # Dropout + self.mlp.add(Dropout(self.dropout_rate)) + # Activation + self.mlp.add(get_act(activation if i < layers - 1 else final_activation)) + + def call(self, inputs): + return self.mlp(inputs) + + +if __name__ == '__main__': + import tensorflow as tf + import numpy as np + from tensorflow.keras.optimizers import Adam + physical_devices = tf.config.list_physical_devices('GPU') + tf.config.experimental.set_memory_growth(physical_devices[0], True) + + # Best config + batch_size = 32 + learning_rate = 0.01 + epochs = 400 + + # Read data + data = TUDataset('PROTEINS') + + # Train/test split + np.random.shuffle(data) + split = int(0.8 * len(data)) + data_tr, data_te = data[:split], data[split:] + + # Data loader + loader_tr = DisjointLoader(data_tr, batch_size=batch_size, epochs=epochs) + loader_te = DisjointLoader(data_te, batch_size=batch_size) + + # Create model + model = GeneralGNN(data.n_labels, activation='softmax') + optimizer = Adam(learning_rate) + model.compile('adam', 'categorical_crossentropy', metrics=['categorical_accuracy']) + + # Evaluation function + def eval(loader): + step = 0 + results = [] + for batch in loader: + step += 1 + l, a = model.test_on_batch(*batch) + results.append((l, a)) + if step == loader.steps_per_epoch: + return np.mean(results, 0) + + # Training loop + epoch = step = 0 + results = [] + for batch in loader_tr: + step += 1 + l, a = model.train_on_batch(*batch) + results.append((l, a)) + if step == loader_tr.steps_per_epoch: + step = 0 + epoch += 1 + results_te = eval(loader_te) + print('Epoch {} - Train loss: {:.3f} - Train acc: {:.3f} - ' + 'Test loss: {:.3f} - Test acc: {:.3f}' + .format(epoch, *np.mean(results, 0), *results_te)) + + results_te = eval(loader_te) + print('Final results - Loss: {:.3f} - Acc: {:.3f}'.format(*results_te)) diff --git a/tests/test_layers/test_convolutional.py b/tests/test_layers/test_convolutional.py index 9eb51e55..92096c48 100644 --- a/tests/test_layers/test_convolutional.py +++ b/tests/test_layers/test_convolutional.py @@ -2,9 +2,7 @@ import tensorflow as tf from tensorflow.keras import Model, Input -from spektral.layers import GCNConv, ChebConv, ECCConv, GATConv, \ - 
GCSConv, ARMAConv, APPNPConv, GraphSageConv, GINConv, DiffusionConv, \ - GatedGraphConv, AGNNConv, TAGConv, CrystalConv, MessagePassing, EdgeConv +from spektral import layers from spektral.layers.ops import sp_matrix_to_sp_tensor tf.keras.backend.set_floatx('float64') @@ -55,89 +53,94 @@ TESTS = [ { - LAYER_K_: GCNConv, + LAYER_K_: layers.GCNConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'sparse': [False, True]}, }, { - LAYER_K_: ChebConv, + LAYER_K_: layers.ChebConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'K': 3, 'channels': 8, 'activation': 'relu', 'sparse': [False, True]} }, { - LAYER_K_: GraphSageConv, + LAYER_K_: layers.GraphSageConv, MODES_K_: [SINGLE], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'sparse': [True]} }, { - LAYER_K_: ECCConv, + LAYER_K_: layers.ECCConv, MODES_K_: [SINGLE, BATCH], KWARGS_K_: {'kernel_network': [8], 'channels': 8, 'activation': 'relu', 'edges': True, 'sparse': [False, True]} }, { - LAYER_K_: GATConv, + LAYER_K_: layers.GATConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'attn_heads': 2, 'concat_heads': False, 'activation': 'relu', 'sparse': [False, True]} }, { - LAYER_K_: GCSConv, + LAYER_K_: layers.GCSConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'sparse': [False, True]} }, { - LAYER_K_: ARMAConv, + LAYER_K_: layers.ARMAConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'order': 2, 'iterations': 2, 'share_weights': True, 'sparse': [False, True]} }, { - LAYER_K_: APPNPConv, + LAYER_K_: layers.APPNPConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'mlp_hidden': [16], 'sparse': [False, True]} }, { - LAYER_K_: GINConv, + LAYER_K_: layers.GINConv, MODES_K_: [SINGLE], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'mlp_hidden': [16], 'sparse': [True]} }, { - LAYER_K_: DiffusionConv, + LAYER_K_: layers.DiffusionConv, MODES_K_: [SINGLE, BATCH, MIXED], KWARGS_K_: {'channels': 8, 'activation': 'tanh', 'num_diffusion_steps': 5, 'sparse': [False]} }, { - LAYER_K_: GatedGraphConv, + LAYER_K_: layers.GatedGraphConv, MODES_K_: [SINGLE], KWARGS_K_: {'channels': 10, 'n_layers': 3, 'sparse': [True]} }, { - LAYER_K_: AGNNConv, + LAYER_K_: layers.AGNNConv, MODES_K_: [SINGLE], KWARGS_K_: {'channels': F, 'trainable': True, 'sparse': [True]} }, { - LAYER_K_: TAGConv, + LAYER_K_: layers.TAGConv, MODES_K_: [SINGLE], KWARGS_K_: {'channels': F, 'K': 3, 'sparse': [True]} }, { - LAYER_K_: CrystalConv, + LAYER_K_: layers.CrystalConv, MODES_K_: [SINGLE], KWARGS_K_: {'channels': F, 'edges': True, 'sparse': [True]} }, { - LAYER_K_: EdgeConv, + LAYER_K_: layers.EdgeConv, MODES_K_: [SINGLE], KWARGS_K_: {'channels': 8, 'activation': 'relu', 'mlp_hidden': [16], 'sparse': [True]} }, { - LAYER_K_: MessagePassing, + LAYER_K_: layers.GeneralConv, + MODES_K_: [SINGLE], + KWARGS_K_: {'channels': 256, 'sparse': [True]} + }, + { + LAYER_K_: layers.MessagePassing, MODES_K_: [SINGLE], KWARGS_K_: {'channels': F, 'sparse': [True]} }, From adb7b074e73d9314a8b8c1a8d8fa55c48904aec8 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 26 Nov 2020 17:01:39 +0100 Subject: [PATCH 44/57] Add ModelNet dataset Add ClusteringCoeff and Delaunay transforms Improve docs --- README.md | 2 +- docs/autogen.py | 4 + docs/templates/getting-started.md | 12 +-- docs/templates/layers/convolution.md | 38 +++----- docs/templates/layers/pooling.md | 12 +-- spektral/datasets/__init__.py | 1 + spektral/datasets/citation.py | 8 +- 
spektral/datasets/graphsage.py | 14 +-- spektral/datasets/mnist.py | 2 +- spektral/datasets/modelnet.py | 93 +++++++++++++++++++ spektral/datasets/ogb.py | 4 +- spektral/datasets/tudataset.py | 5 +- spektral/layers/convolutional/agnn_conv.py | 12 ++- spektral/layers/convolutional/appnp_conv.py | 10 +- spektral/layers/convolutional/arma_conv.py | 13 +-- spektral/layers/convolutional/cheb_conv.py | 12 ++- spektral/layers/convolutional/crystal_conv.py | 17 ++-- .../layers/convolutional/diffusion_conv.py | 22 ++--- spektral/layers/convolutional/ecc_conv.py | 12 ++- spektral/layers/convolutional/edge_conv.py | 9 +- spektral/layers/convolutional/gat_conv.py | 27 ++---- .../layers/convolutional/gated_graph_conv.py | 13 +-- spektral/layers/convolutional/gcn_conv.py | 8 +- spektral/layers/convolutional/gcs_conv.py | 2 +- spektral/layers/convolutional/general_conv.py | 15 +-- spektral/layers/convolutional/gin_conv.py | 13 ++- .../layers/convolutional/graphsage_conv.py | 10 +- .../layers/convolutional/message_passing.py | 10 +- spektral/layers/convolutional/tag_conv.py | 6 +- spektral/layers/pooling/diff_pool.py | 9 +- spektral/layers/pooling/global_pool.py | 6 +- spektral/layers/pooling/mincut_pool.py | 8 +- spektral/layers/pooling/sag_pool.py | 9 +- spektral/layers/pooling/topk_pool.py | 15 ++- spektral/models/general_gnn.py | 11 +-- spektral/transforms/__init__.py | 4 +- spektral/transforms/clustering_coefficient.py | 22 +++++ spektral/transforms/constant.py | 3 - spektral/transforms/degree.py | 3 - spektral/transforms/delaunay.py | 20 ++++ spektral/transforms/gcn_filter.py | 3 +- spektral/transforms/normalize_adj.py | 3 +- spektral/transforms/normalize_one.py | 1 - spektral/transforms/normalize_sphere.py | 3 +- spektral/utils/io.py | 48 +++++++++- tests/test_transforms/test_transforms.py | 41 +++++--- 46 files changed, 403 insertions(+), 212 deletions(-) create mode 100644 spektral/datasets/modelnet.py create mode 100644 spektral/transforms/clustering_coefficient.py create mode 100644 spektral/transforms/delaunay.py diff --git a/README.md b/README.md index 03a22cf5..42e73f60 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Spektral is a Python library for graph deep learning, based on the Keras API and TensorFlow 2. The main goal of this project is to provide a simple but flexible framework for creating graph neural networks (GNNs). -You can use Spektral for classifying the nodes of a network, predicting molecular properties, generating new graphs with GANs, clustering nodes, predicting links, and any other task where data is described by graphs. +You can use Spektral for classifying the userss of a social network, predicting molecular properties, generating new graphs with GANs, clustering nodes, predicting links, and any other task where data is described by graphs. 
Spektral implements some of the most popular layers for graph deep learning, including: diff --git a/docs/autogen.py b/docs/autogen.py index c3d9c2b8..fede9715 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -141,6 +141,8 @@ datasets.GraphSage, datasets.PPI, datasets.Reddit, + datasets.MNIST, + datasets.ModelNet, datasets.OGB, datasets.QM9, datasets.TUDataset, @@ -153,7 +155,9 @@ 'methods': [], 'classes': [ transforms.AdjToSpTensor, + transforms.ClusteringCoeff, transforms.Constant, + transforms.Delaunay, transforms.Degree, transforms.GCNFilter, transforms.LayerPreprocess, diff --git a/docs/templates/getting-started.md b/docs/templates/getting-started.md index 75c8a693..c3bde713 100644 --- a/docs/templates/getting-started.md +++ b/docs/templates/getting-started.md @@ -80,7 +80,7 @@ First, we compute the maximum degree of the dataset, so that we know the size of 12 ``` -Try to go over the lambda function to see what it does. Also, notice that we passed another function to the `reduce` keyword. Can you guess why? +Try to go over the lambda function to see what it does. Also, notice that we passed another function to the method with the `reduce` keyword. Can you guess why? Now we are ready to augment our node features with the one-hot-encoded degree. Spektral has a lot of pre-implemented `transforms` that we can use: @@ -90,16 +90,16 @@ Now we are ready to augment our node features with the one-hot-encoded degree. S >>> dataset.apply(Degree(max_degree)) ``` -We can see that it worked because now we have and extra `max_degree + 1` node features, which are our one-hot vectors: +We can see that it worked because now we have an extra `max_degree + 1` node features, which are our one-hot vectors: ```python >>> dataset[0] Graph(n_nodes=42, n_node_features=17, n_edge_features=None, y=[1. 0.]) ``` -Since we will be using a `GraphConv` layer in our GNN, we also want to follow the [original paper](https://arxiv.org/abs/1609.02907) that introduced this layer, and do some extra pre-processing. +Since we will be using a `GCNConv` layer in our GNN, we also want to follow the [original paper](https://arxiv.org/abs/1609.02907) that introduced this layer, and do some extra pre-processing of the adjacency matrix. -Specifically, we need to normalize the adjacency matrix of each graph by the node degrees. Since this is a fairly common operation, Spektral has a transform to do it: +Since this is a fairly common operation, Spektral has a transform to do it: ```python >>> from spektral.transforms import GCNFilter @@ -133,7 +133,7 @@ class MyFirstGNN(Model): def __init__(self, n_hidden, n_labels): super().__init__() - self.graph_conv = GraphConv(n_hidden) + self.graph_conv = GCNConv(n_hidden) self.pool = GlobalSumPool() self.dropout = Dropout(0.5) self.dense = Dense(n_labels, 'softmax') @@ -226,7 +226,7 @@ print('Test loss: {}'.format(loss)) Besides learning to predict labels for the whole graph, like in this tutorial, GNNs are very effective at learning to predict labels for each individual node. This is called "node-level learning" and we usually do it for datasets with one big graph (think a social network). -For example, reproducing the results of the [GCN paper for classifying nodes in a citation network](https://arxiv.org/abs/1609.02907) can be done with `GraphConv` layers, the `Citation` dataset, and a `SingleLoader`: check out [this example](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gcn.py). 
+For example, reproducing the results of the [GCN paper for classifying nodes in a citation network](https://arxiv.org/abs/1609.02907) can be done with `GCNConv` layers, the `Citation` dataset, and a `SingleLoader`: check out [this example](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gcn.py). As a matter of fact, check out [all the examples](/examples). diff --git a/docs/templates/layers/convolution.md b/docs/templates/layers/convolution.md index 0b9eec20..f6c3ad88 100644 --- a/docs/templates/layers/convolution.md +++ b/docs/templates/layers/convolution.md @@ -1,35 +1,21 @@ # Convolutional layers -The message-passing layers from these papers are available in Spektral: - -- [Semi-Supervised Classification with Graph Convolutional Networks](https://arxiv.org/abs/1609.02907) -- [Convolutional Neural Networks on Graphs with Fast Localized Spectral Filtering](https://arxiv.org/abs/1606.09375) -- [Inductive Representation Learning on Large Graphs](https://arxiv.org/abs/1706.02216) -- [Graph Neural Networks with convolutional ARMA filters](https://arxiv.org/abs/1901.01343) -- [Dynamic Edge-Conditioned Filters in Convolutional Neural Networks on Graphs](https://arxiv.org/abs/1704.02901) -- [Graph Attention Networks](https://arxiv.org/abs/1710.10903) -- [Predict then Propagate: Graph Neural Networks meet Personalized PageRank](https://arxiv.org/abs/1810.05997) -- [How Powerful are Graph Neural Networks?](https://arxiv.org/abs/1810.00826) -- [Diffusion Convolutional Recurrent Neural Network: Data-Driven Traffic Forecasting](https://arxiv.org/abs/1707.01926) -- [Gated Graph Sequence Neural Networks](https://arxiv.org/abs/1511.05493) -- [Attention-based Graph Neural Network for Semi-supervised Learning](https://arxiv.org/abs/1803.03735) -- [Topology Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1710.10370) -- [Crystal Graph Convolutional Neural Networks for an Accurate and Interpretable Prediction of Material Properties](https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.145301) -- [Dynamic Graph CNN for Learning on Point Clouds](https://arxiv.org/abs/1801.07829) +The following convolutional/message-passing layers are available in Spektral. 
Notation: -- \( N \): number of nodes in the graph; -- \( F \): dimension of the node attributes (i.e., each node has an attribute in \( \mathbb{R}^F \)); -- \( S \): dimension of the edge attributes (i.e., each edge has an attribute in \( \mathbb{R}^S \)); -- \( \A \in \{0, 1\}^{N \times N}\): binary adjacency matrix; -- \( \X \in \mathbb{R}^{ N \times F } \): node attributes matrix; -- \( \E \in \mathbb{R}^{ N \times N \times S } \): edge attributes matrix; -- \( \D = \textrm{diag} ( \sum\limits_{j=0} \A_{ij} )\): degree matrix; -- \( \W, \V \): trainable kernels; +- \( N \): number of nodes; +- \( F \): size of the node attributes; +- \( S \): size of the edge attributes; +- \( \x_i \): node attributes of the i-th node; +- \( \e_{i \rightarrow j}\): edge attributes of the edge from node i to node j; +- \( \A \): adjacency matrix; +- \( \X \): node attributes matrix; +- \( \E \): edge attributes matrix; +- \( \D \): degree matrix; +- \( \W, \V \): trainable weights matrices; - \( \b \): trainable bias vector; -- \( \mathcal{N}(i) \): the one-hop neighbourhood of node \(i\); -- \( F' \): dimension of the node attributes after a message-passing layer; +- \( \mathcal{N}(i) \): one-hop neighbourhood of node \(i\); --- diff --git a/docs/templates/layers/pooling.md b/docs/templates/layers/pooling.md index 40ce6d1f..ece8f1ed 100644 --- a/docs/templates/layers/pooling.md +++ b/docs/templates/layers/pooling.md @@ -1,16 +1,6 @@ # Pooling layers -The pooling layers from these papers are available in Spektral: - -- [Hierarchical Graph Representation Learning with Differentiable Pooling](https://arxiv.org/abs/1806.08804) -- [Mincut pooling in Graph Neural Networks](https://arxiv.org/abs/1907.00481) -- [Graph U-Nets](http://proceedings.mlr.press/v97/gao19a/gao19a.pdf) -- [Self-Attention Graph Pooling](https://arxiv.org/abs/1904.08082) -- [Gated Graph Sequence Neural Networks](https://arxiv.org/abs/1511.05493) - -Additionally, sum, average, and max global pooling are implemented, as well as -a simple global weighted sum pooling where weights are calculated with an -attention mechanism. +The following pooling layers are available in Spektral. See [the convolutional layers page](/layers/convolution) for the notation. diff --git a/spektral/datasets/__init__.py b/spektral/datasets/__init__.py index 38387b2d..d6419daf 100644 --- a/spektral/datasets/__init__.py +++ b/spektral/datasets/__init__.py @@ -1,6 +1,7 @@ from .citation import Citation from .graphsage import GraphSage, PPI, Reddit from .mnist import MNIST +from .modelnet import ModelNet from .ogb import OGB from .qm9 import QM9 from .tudataset import TUDataset diff --git a/spektral/datasets/citation.py b/spektral/datasets/citation.py index e3a038e8..78931496 100644 --- a/spektral/datasets/citation.py +++ b/spektral/datasets/citation.py @@ -39,8 +39,8 @@ class Citation(Dataset): def __init__(self, name, random_split=False, normalize_x=False, **kwargs): self.name = name.lower() - if self.name not in self.available_datasets(): - raise ValueError('Unknown dataset {}. See Citation.available_datasets() ' + if self.name not in self.available_datasets: + raise ValueError('Unknown dataset {}. 
See Citation.available_datasets ' 'for a list of available datasets.') self.random_split = random_split self.normalize_x = normalize_x @@ -116,8 +116,8 @@ def download(self): with open(os.path.join(self.path, f_name), 'wb') as out_file: out_file.write(req.content) - @staticmethod - def available_datasets(): + @property + def available_datasets(self): return ['cora', 'citeseer', 'pubmed'] diff --git a/spektral/datasets/graphsage.py b/spektral/datasets/graphsage.py index 314d264d..dcdc14e0 100644 --- a/spektral/datasets/graphsage.py +++ b/spektral/datasets/graphsage.py @@ -15,8 +15,10 @@ class GraphSage(Dataset): """ - The datasets used in the GraphSage paper - [(Hamilton & Ying (2017))](https://arxiv.org/abs/1706.02216): PPI and Reddit. + The datasets used in the paper + + > [Inductive Representation Learning on Large Graphs](https://arxiv.org/abs/1706.02216)
+ > William L. Hamilton et al. The PPI dataset (originally [Stark et al. (2006)](https://www.ncbi.nlm.nih.gov/pubmed/16381927)) @@ -51,9 +53,9 @@ class GraphSage(Dataset): url = 'http://snap.stanford.edu/graphsage/{}.zip' def __init__(self, name, **kwargs): - if name.lower() not in self.available_datasets(): + if name.lower() not in self.available_datasets: raise ValueError('Unknown dataset: {}. Possible: {}' - .format(name, self.available_datasets())) + .format(name, self.available_datasets)) self.name = name.lower() self.mask_tr = self.mask_va = self.mask_te = None super().__init__(**kwargs) @@ -107,8 +109,8 @@ def download(self): adj_col=adj.col, adj_shape=adj.shape, y=y, mask_tr=mask_tr, mask_va=mask_va, mask_te=mask_te) - @staticmethod - def available_datasets(): + @property + def available_datasets(self): return ['ppi', 'reddit'] diff --git a/spektral/datasets/mnist.py b/spektral/datasets/mnist.py index 469ee715..0a142a4d 100644 --- a/spektral/datasets/mnist.py +++ b/spektral/datasets/mnist.py @@ -10,7 +10,7 @@ class MNIST(Dataset): """ - The MNIST dataset used as node features for a grid graph, as described by + The MNIST images used as node features for a grid graph, as described by [Defferrard et al. (2016)](https://arxiv.org/abs/1606.09375). This dataset is a graph signal classification task, where graphs are diff --git a/spektral/datasets/modelnet.py b/spektral/datasets/modelnet.py new file mode 100644 index 00000000..6709e9ba --- /dev/null +++ b/spektral/datasets/modelnet.py @@ -0,0 +1,93 @@ +import os +import os.path as osp +import shutil +import zipfile +from glob import glob + +import requests +from joblib import Parallel, delayed +from tqdm import tqdm + +from spektral.data import Dataset +from spektral.utils import one_hot, load_off + + +class ModelNet(Dataset): + """ + The ModelNet10 and ModelNet40 CAD models datasets from the paper: + + > [3D ShapeNets: A Deep Representation for Volumetric Shapes](https://arxiv.org/abs/1406.5670)
+ > Zhirong Wu et al. + + Each graph represents a CAD model belonging to one of 10 (or 40) categories. + + The models are polygon meshes: the node attributes are the 3d coordinates + of the vertices, and edges are computed from each face. Duplicate edges are + ignored and the adjacency matrix is binary. + + The dataset are pre-split into training and test sets: the `test` flag + controls which split is loaded. + + **Arguments** + + - `name`: name of the dataset to load ('10' or '40'); + - `test`: if True, load the test set instead of the training set. + - `n_jobs`: number of CPU cores to use for reading the data (-1, to use all + available cores) + """ + url = {'10': 'http://vision.princeton.edu/projects/2014/3DShapeNets/ModelNet10.zip', + '40': 'http://modelnet.cs.princeton.edu/ModelNet40.zip'} + + def __init__(self, name, test=False, n_jobs=-1, **kwargs): + if name not in self.available_datasets: + raise ValueError('Unknown dataset {}. Possible: {}' + .format(name, self.available_datasets)) + self.name = name + self.test = test + self.n_jobs = n_jobs + self.true_path = osp.join(self.path, 'ModelNet' + self.name) + super().__init__(**kwargs) + + def read(self): + folders = glob(osp.join(self.true_path, '*', '')) + dataset = 'test' if self.test else 'train' + classes = [f.split('/')[-2] for f in folders] + n_out = len(classes) + + print('Loading data') + + def load(fname): + graph = load_off(fname) + graph.y = one_hot(i, n_out) + return graph + + output = [] + for i, c in enumerate(tqdm(classes)): + fnames = osp.join(self.true_path, c, dataset, '{}_*.off'.format(c)) + fnames = glob(fnames) + output_partial = Parallel(n_jobs=self.n_jobs)( + delayed(load)(fname) for fname in fnames) + output.extend(output_partial) + + return output + + def download(self): + print('Downloading ModelNet{} dataset.'.format(self.name)) + url = self.url[self.name] + req = requests.get(url) + if req.status_code == 404: + raise ValueError('Cannot download dataset ({} returned 404).' + .format(self.url)) + os.makedirs(self.path, exist_ok=True) + + fname = osp.join(self.path, 'ModelNet' + self.name + '.zip') + with open(fname, 'wb') as of: + of.write(req.content) + with zipfile.ZipFile(fname, 'r') as of: + of.extractall(self.path) + + shutil.rmtree(osp.join(self.path, '__MACOSX'), ignore_errors=True) + + @property + def available_datasets(self): + return ['10', '40'] diff --git a/spektral/datasets/ogb.py b/spektral/datasets/ogb.py index 42fd3b96..92589aac 100644 --- a/spektral/datasets/ogb.py +++ b/spektral/datasets/ogb.py @@ -6,11 +6,11 @@ class OGB(Dataset): """ - Wrapper for OGB datasets. + Wrapper for datasets from the [Open Graph Benchmark (OGB)](https://ogb.stanford.edu/). **Arguments** - - `dataset`: an OGB library-agnostic Graph*Dataset object. + - `dataset`: an OGB library-agnostic dataset. """ def __init__(self, dataset, **kwargs): diff --git a/spektral/datasets/tudataset.py b/spektral/datasets/tudataset.py index 8dc8a3a1..e0cf4315 100644 --- a/spektral/datasets/tudataset.py +++ b/spektral/datasets/tudataset.py @@ -43,7 +43,7 @@ class TUDataset(Dataset): **Arguments** - - `name`: str, name of the dataset to load (see `TUD.available_datasets()`). + - `name`: str, name of the dataset to load (see `TUD.available_datasets`). - `clean`: if `True`, rload a version of the dataset with no isomorphic graphs. 
""" @@ -179,13 +179,14 @@ def read(self): return [Graph(x=x, a=a, e=e, y=y) for x, a, e, y in zip(x_list, a_list, e_list, labels)] + @property def available_datasets(self): try: names = pd.read_html(self.url)[0].Name[2:-1].values.tolist() return [d[:-4] for d in names] except URLError: # No internet, don't panic - print('No connection. See {}'.format(self.url)) + print('Could not read URL {}'.format(self.url)) return [] diff --git a/spektral/layers/convolutional/agnn_conv.py b/spektral/layers/convolutional/agnn_conv.py index 70160d75..9451259b 100644 --- a/spektral/layers/convolutional/agnn_conv.py +++ b/spektral/layers/convolutional/agnn_conv.py @@ -7,8 +7,10 @@ class AGNNConv(MessagePassing): r""" - An Attention-based Graph Neural Network (AGNN) as presented by - [Thekumparampil et al. (2018)](https://arxiv.org/abs/1803.03735). + An Attention-based Graph Neural Network (AGNN) from the paper + + > [Attention-based Graph Neural Network for Semi-supervised Learning](https://arxiv.org/abs/1803.03735)
+ > Kiran K. Thekumparampil et al. **Mode**: single, disjoint. @@ -16,15 +18,15 @@ class AGNNConv(MessagePassing): This layer computes: $$ - \Z = \P\X + \X' = \P\X $$ where $$ \P_{ij} = \frac{ - \exp \left( \beta \cos \left( \X_i, \X_j \right) \right) + \exp \left( \beta \cos \left( \x_i, \x_j \right) \right) }{ \sum\limits_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp \left( \beta \cos \left( \X_i, \X_k \right) \right) + \exp \left( \beta \cos \left( \x_i, \x_k \right) \right) } $$ and \(\beta\) is a trainable parameter. diff --git a/spektral/layers/convolutional/appnp_conv.py b/spektral/layers/convolutional/appnp_conv.py index 674be779..dc4e2026 100644 --- a/spektral/layers/convolutional/appnp_conv.py +++ b/spektral/layers/convolutional/appnp_conv.py @@ -8,8 +8,10 @@ class APPNPConv(GCNConv): r""" - A graph convolutional layer implementing the APPNP operator, as presented by - [Klicpera et al. (2019)](https://arxiv.org/abs/1810.05997). + The APPNP operator from the paper + + > [Predict then Propagate: Graph Neural Networks meet Personalized PageRank](https://arxiv.org/abs/1810.05997)
+ > Johannes Klicpera et al. This layer computes: $$ @@ -17,8 +19,8 @@ class APPNPConv(GCNConv): \Z^{(K)} = (1 - \alpha) \hat \D^{-1/2} \hat \A \hat \D^{-1/2} \Z^{(K - 1)} + \alpha \Z^{(0)}, $$ - where \(\alpha\) is the _teleport_ probability and \(\textrm{MLP}\) is a - multi-layer perceptron. + where \(\alpha\) is the teleport probability, \(\textrm{MLP}\) is a + multi-layer perceptron, and \(K\) is defined by the `propagations` argument. **Mode**: single, disjoint, mixed, batch. diff --git a/spektral/layers/convolutional/arma_conv.py b/spektral/layers/convolutional/arma_conv.py index c8886c59..e7fc8cca 100644 --- a/spektral/layers/convolutional/arma_conv.py +++ b/spektral/layers/convolutional/arma_conv.py @@ -8,27 +8,28 @@ class ARMAConv(GCNConv): r""" - A graph convolutional layer with ARMA\(_K\) filters, as presented by - [Bianchi et al. (2019)](https://arxiv.org/abs/1901.01343). + An Auto-Regressive Moving Average convolutional layer (ARMA) from the paper + + > [Graph Neural Networks with convolutional ARMA filters](https://arxiv.org/abs/1901.01343)
+ > Filippo Maria Bianchi et al. **Mode**: single, disjoint, mixed, batch. This layer computes: $$ - \Z = \frac{1}{K} \sum\limits_{k=1}^K \bar\X_k^{(T)}, + \X' = \frac{1}{K} \sum\limits_{k=1}^K \bar\X_k^{(T)}, $$ where \(K\) is the order of the ARMA\(_K\) filter, and where: $$ \bar \X_k^{(t + 1)} = - \sigma \left(\tilde \L \bar \X^{(t)} \W^{(t)} + \X \V^{(t)} \right) + \sigma \left(\tilde \A \bar \X^{(t)} \W^{(t)} + \X \V^{(t)} \right) $$ is a recursive approximation of an ARMA\(_1\) filter, where \( \bar \X^{(0)} = \X \) and $$ - \tilde \L = \frac{2}{\lambda_{max}} \cdot (\I - \D^{-1/2} \A \D^{-1/2}) - \I + \tilde \A = \D^{-1/2} \A \D^{-1/2}. $$ - is the normalized Laplacian with a rescaled spectrum. **Input** diff --git a/spektral/layers/convolutional/cheb_conv.py b/spektral/layers/convolutional/cheb_conv.py index efbe3548..38a3fd68 100644 --- a/spektral/layers/convolutional/cheb_conv.py +++ b/spektral/layers/convolutional/cheb_conv.py @@ -7,14 +7,17 @@ class ChebConv(GCNConv): r""" - A Chebyshev convolutional layer as presented by - [Defferrard et al. (2016)](https://arxiv.org/abs/1606.09375). + A Chebyshev convolutional layer from the paper + + > [Convolutional Neural Networks on Graphs with Fast Localized Spectral + Filtering](https://arxiv.org/abs/1606.09375)
+ > Michaël Defferrard et al. **Mode**: single, disjoint, mixed, batch. This layer computes: $$ - \Z = \sum \limits_{k=0}^{K - 1} \T^{(k)} \W^{(k)} + \b^{(k)}, + \X' = \sum \limits_{k=0}^{K - 1} \T^{(k)} \W^{(k)} + \b^{(k)}, $$ where \( \T^{(0)}, ..., \T^{(K - 1)} \) are Chebyshev polynomials of \(\tilde \L\) defined as @@ -25,9 +28,8 @@ class ChebConv(GCNConv): $$ where $$ - \tilde \L = \frac{2}{\lambda_{max}} \cdot (\I - \D^{-1/2} \A \D^{-1/2}) - \I + \tilde \L = \frac{2}{\lambda_{max}} \cdot (\I - \D^{-1/2} \A \D^{-1/2}) - \I. $$ - is the normalized Laplacian with a rescaled spectrum. **Input** diff --git a/spektral/layers/convolutional/crystal_conv.py b/spektral/layers/convolutional/crystal_conv.py index 410874e2..aeb7f1b8 100644 --- a/spektral/layers/convolutional/crystal_conv.py +++ b/spektral/layers/convolutional/crystal_conv.py @@ -6,20 +6,21 @@ class CrystalConv(MessagePassing): r""" - A Crystal Graph Convolutional layer as presented by - [Xie & Grossman (2018)](https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.145301). + A crystal graph convolutional layer from the paper + + > [Crystal Graph Convolutional Neural Networks for an Accurate and + Interpretable Prediction of Material Properties](https://arxiv.org/abs/1710.10324)
+ > Tian Xie and Jeffrey C. Grossman **Mode**: single, disjoint. **This layer expects a sparse adjacency matrix.** - This layer computes for each node \(i\): + This layer computes: $$ - \H_i = \X_i + - \sum\limits_{j \in \mathcal{N}(i)} - \sigma \left( \z_{ij} \W^{(f)} + \b^{(f)} \right) - \odot - \g \left( \z_{ij} \W^{(s)} + \b^{(s)} \right) + \x_i' = \x_i + \sum\limits_{j \in \mathcal{N}(i)} \sigma \left( \z_{ij} + \W^{(f)} + \b^{(f)} \right) \odot \g \left( \z_{ij} \W^{(s)} + \b^{(s)} + \right) $$ where \(\z_{ij} = \X_i \| \X_j \| \E_{ij} \), \(\sigma\) is a sigmoid activation, and \(g\) is the activation function (defined by the `activation` diff --git a/spektral/layers/convolutional/diffusion_conv.py b/spektral/layers/convolutional/diffusion_conv.py index 35d17144..e8f2aba0 100644 --- a/spektral/layers/convolutional/diffusion_conv.py +++ b/spektral/layers/convolutional/diffusion_conv.py @@ -71,24 +71,22 @@ def call(self, inputs): class DiffusionConv(GCNConv): - r"""Applies Graph Diffusion Convolution as descibed by - [Li et al. (2016)](https://arxiv.org/pdf/1707.01926.pdf) + r""" + A diffusion convolution operator from the paper + + > [Diffusion Convolutional Recurrent Neural Network: Data-Driven Traffic + Forecasting](https://arxiv.org/abs/1707.01926)
+ > Yaguang Li et al. **Mode**: single, disjoint, mixed, batch. **This layer expects a dense adjacency matrix.** - Given a number of diffusion steps \(K\) and a row normalized adjacency matrix \(\hat \A \), - this layer calculates the q'th channel as: - + Given a number of diffusion steps \(K\) and a row-normalized adjacency + matrix \(\hat \A \), this layer calculates the \(q\)-th channel as: $$ - \mathbf{H}_{~:,~q} = \sigma\left( - \sum_{f=1}^{n_node_features} - \left( - \sum_{k=0}^{K-1}\theta_k {\hat \A}^k - \right) - \X_{~:,~f} - \right) + \mathbf{X}_{~:,~q}' = \sigma\left( \sum_{f=1}^{F} \left( \sum_{k=0}^{K-1} + \theta_k {\hat \A}^k \right) \X_{~:,~f} \right) $$ **Input** diff --git a/spektral/layers/convolutional/ecc_conv.py b/spektral/layers/convolutional/ecc_conv.py index 4b5925e0..06728d84 100644 --- a/spektral/layers/convolutional/ecc_conv.py +++ b/spektral/layers/convolutional/ecc_conv.py @@ -9,8 +9,11 @@ class ECCConv(GCNConv): r""" - An edge-conditioned convolutional layer (ECC) as presented by - [Simonovsky & Komodakis (2017)](https://arxiv.org/abs/1704.02901). + An edge-conditioned convolutional layer (ECC) from the paper + + > [Dynamic Edge-Conditioned Filters in Convolutional Neural Networks on + Graphs](1704.02901)
+ > Martin Simonovsky and Nikos Komodakis **Mode**: single, disjoint, batch. @@ -19,9 +22,10 @@ class ECCConv(GCNConv): - In single mode, if the adjacency matrix is dense it will be converted to a SparseTensor automatically (which is an expensive operation). - For each node \( i \), this layer computes: + This layer computes: $$ - \Z_i = \X_{i} \W_{\textrm{root}} + \sum\limits_{j \in \mathcal{N}(i)} \X_{j} \textrm{MLP}(\E_{ji}) + \b + \x_i' = \x_{i} \W_{\textrm{root}} + \sum\limits_{j \in \mathcal{N}(i)} + \x_{j} \textrm{MLP}(\e_{j \rightarrow i}) + \b $$ where \(\textrm{MLP}\) is a multi-layer perceptron that outputs an edge-specific weight as a function of edge attributes. diff --git a/spektral/layers/convolutional/edge_conv.py b/spektral/layers/convolutional/edge_conv.py index bc7d90e3..34608ced 100644 --- a/spektral/layers/convolutional/edge_conv.py +++ b/spektral/layers/convolutional/edge_conv.py @@ -7,8 +7,10 @@ class EdgeConv(MessagePassing): r""" - An Edge Convolutional layer as presented by - [Wang et al. (2018)](https://arxiv.org/abs/1801.07829). + An edge convolutional layer from the paper + + > [Dynamic Graph CNN for Learning on Point Clouds](https://arxiv.org/abs/1801.07829)
+ > Yue Wang et al. **Mode**: single, disjoint. @@ -16,7 +18,8 @@ class EdgeConv(MessagePassing): This layer computes for each node \(i\): $$ - \Z_i = \sum\limits_{j \in \mathcal{N}(i)} \textrm{MLP}\big( \X_i \| \X_j - \X_i \big) + \x_i' = \sum\limits_{j \in \mathcal{N}(i)} \textrm{MLP}\big( \x_i \| + \x_j - \x_i \big) $$ where \(\textrm{MLP}\) is a multi-layer perceptron. diff --git a/spektral/layers/convolutional/gat_conv.py b/spektral/layers/convolutional/gat_conv.py index 02895267..ec9df941 100644 --- a/spektral/layers/convolutional/gat_conv.py +++ b/spektral/layers/convolutional/gat_conv.py @@ -10,8 +10,10 @@ class GATConv(GCNConv): r""" - A graph attention layer (GAT) as presented by - [Velickovic et al. (2017)](https://arxiv.org/abs/1710.10903). + A Graph Attention layer (GAT) from the paper + + > [Graph Attention Networks](https://arxiv.org/abs/1710.10903)
+ > Petar Veličković et al. **Mode**: single, disjoint, mixed, batch. @@ -21,25 +23,14 @@ class GATConv(GCNConv): uses the attention mechanism to weight the adjacency matrix instead of using the normalized Laplacian: $$ - \Z = \mathbf{\alpha}\X\W + \b + \X' = \mathbf{\alpha}\X\W + \b $$ where $$ - \mathbf{\alpha}_{ij} = - \frac{ - \exp\left( - \mathrm{LeakyReLU}\left( - \a^{\top} [(\X\W)_i \, \| \, (\X\W)_j] - \right) - \right) - } - {\sum\limits_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp\left( - \mathrm{LeakyReLU}\left( - \a^{\top} [(\X\W)_i \, \| \, (\X\W)_k] - \right) - \right) - } + \mathbf{\alpha}_{ij} =\frac{ \exp\left(\mathrm{LeakyReLU}\left( + \a^{\top} [(\X\W)_i \, \| \, (\X\W)_j]\right)\right)}{\sum\limits_{k + \in \mathcal{N}(i) \cup \{ i \}} \exp\left(\mathrm{LeakyReLU}\left( + \a^{\top} [(\X\W)_i \, \| \, (\X\W)_k]\right)\right)} $$ where \(\a \in \mathbb{R}^{2F'}\) is a trainable attention kernel. Dropout is also applied to \(\alpha\) before computing \(\Z\). diff --git a/spektral/layers/convolutional/gated_graph_conv.py b/spektral/layers/convolutional/gated_graph_conv.py index 92d0a4e6..3bac8070 100644 --- a/spektral/layers/convolutional/gated_graph_conv.py +++ b/spektral/layers/convolutional/gated_graph_conv.py @@ -7,23 +7,24 @@ class GatedGraphConv(MessagePassing): r""" - A gated graph convolutional layer as presented by - [Li et al. (2018)](https://arxiv.org/abs/1511.05493). + A gated graph convolutional layer from the paper + + > [Gated Graph Sequence Neural Networks](https://arxiv.org/abs/1511.05493)
+ > Yujia Li et al. **Mode**: single, disjoint. **This layer expects a sparse adjacency matrix.** - This layer repeatedly applies a GRU cell \(L\) times to the node attributes + This layer computes \(\x_i' = \h^{(L)}_i\) where: $$ \begin{align} - & \h^{(0)}_i = \X_i \| \mathbf{0} \\ + & \h^{(0)}_i = \x_i \| \mathbf{0} \\ & \m^{(l)}_i = \sum\limits_{j \in \mathcal{N}(i)} \h^{(l - 1)}_j \W \\ & \h^{(l)}_i = \textrm{GRU} \left(\m^{(l)}_i, \h^{(l - 1)}_i \right) \\ - & \Z_i = h^{(L)}_i \end{align} $$ - where \(\textrm{GRU}\) is the GRU cell. + where \(\textrm{GRU}\) is a gated recurrent unit cell. **Input** diff --git a/spektral/layers/convolutional/gcn_conv.py b/spektral/layers/convolutional/gcn_conv.py index 4be487d5..a721acab 100644 --- a/spektral/layers/convolutional/gcn_conv.py +++ b/spektral/layers/convolutional/gcn_conv.py @@ -8,14 +8,16 @@ class GCNConv(Layer): r""" - A graph convolutional layer (GCN) as presented by - [Kipf & Welling (2016)](https://arxiv.org/abs/1609.02907). + A graph convolutional layer (GCN) from the paper + + > [Semi-Supervised Classification with Graph Convolutional Networks](https://arxiv.org/abs/1609.02907)
+ > Thomas N. Kipf and Max Welling **Mode**: single, disjoint, mixed, batch. This layer computes: $$ - \Z = \hat \D^{-1/2} \hat \A \hat \D^{-1/2} \X \W + \b + \X' = \hat \D^{-1/2} \hat \A \hat \D^{-1/2} \X \W + \b $$ where \( \hat \A = \A + \I \) is the adjacency matrix with added self-loops and \(\hat\D\) is its degree matrix. diff --git a/spektral/layers/convolutional/gcs_conv.py b/spektral/layers/convolutional/gcs_conv.py index a8c83a49..3123cdf8 100644 --- a/spektral/layers/convolutional/gcs_conv.py +++ b/spektral/layers/convolutional/gcs_conv.py @@ -13,7 +13,7 @@ class GCSConv(GCNConv): This layer computes: $$ - \Z = \D^{-1/2} \A \D^{-1/2} \X \W_1 + \X \W_2 + \b + \Z' = \D^{-1/2} \A \D^{-1/2} \X \W_1 + \X \W_2 + \b $$ where \( \A \) does not have self-loops (unlike in GraphConv). diff --git a/spektral/layers/convolutional/general_conv.py b/spektral/layers/convolutional/general_conv.py index af5e5bf3..7cdca49c 100644 --- a/spektral/layers/convolutional/general_conv.py +++ b/spektral/layers/convolutional/general_conv.py @@ -7,17 +7,18 @@ class GeneralConv(MessagePassing): r""" - A general convolutional layer as described by - [You et al.](https://arxiv.org/abs/2011.08843). + A general convolutional layer from the paper + + > [Design Space for Graph Neural Networks](https://arxiv.org/abs/2011.08843)
+ > Jiaxuan You et al. **Mode**: single, disjoint. **This layer expects a sparse adjacency matrix.** This layer computes: - $$ - \h_i = \mathrm{Agg} \left( \left\{ \mathrm{Act} \left( \mathrm{Dropout} + \x_i' = \mathrm{Agg} \left( \left\{ \mathrm{Act} \left( \mathrm{Dropout} \left( \mathrm{BN} \left( \x_j \W + \b \right) \right) \right), j \in \mathcal{N}(i) \right\} \right) $$ @@ -143,6 +144,8 @@ def get_config(self): 'channels': self.channels, } base_config = super().get_config() - base_config.pop('aggregate') # Remove it because it's defined by constructor - base_config['activation'] = 'prelu' + if isinstance(self.activation, PReLU): + base_config['activation'] = 'prelu' + else: + base_config['activation'] = activations.serialize(self.activation) return {**base_config, **config} \ No newline at end of file diff --git a/spektral/layers/convolutional/gin_conv.py b/spektral/layers/convolutional/gin_conv.py index 546d110e..bae79624 100644 --- a/spektral/layers/convolutional/gin_conv.py +++ b/spektral/layers/convolutional/gin_conv.py @@ -7,8 +7,10 @@ class GINConv(MessagePassing): r""" - A Graph Isomorphism Network (GIN) as presented by - [Xu et al. (2018)](https://arxiv.org/abs/1810.00826). + A Graph Isomorphism Network (GIN) from the paper + + > [How Powerful are Graph Neural Networks?](https://arxiv.org/abs/1810.00826)
+ > Keyulu Xu et al. **Mode**: single, disjoint. @@ -16,7 +18,8 @@ class GINConv(MessagePassing): This layer computes for each node \(i\): $$ - \Z_i = \textrm{MLP}\big( (1 + \epsilon) \cdot \X_i + \sum\limits_{j \in \mathcal{N}(i)} \X_j \big) + \x_i' = \textrm{MLP}\big( (1 + \epsilon) \cdot \x_i + \sum\limits_{j + \in \mathcal{N}(i)} \x_j \big) $$ where \(\textrm{MLP}\) is a multi-layer perceptron. @@ -33,8 +36,8 @@ class GINConv(MessagePassing): **Arguments** - `channels`: integer, number of output channels; - - `epsilon`: unnamed parameter, see - [Xu et al. (2018)](https://arxiv.org/abs/1810.00826), and the equation above. + - `epsilon`: unnamed parameter, see the original paper and the equation + above. By setting `epsilon=None`, the parameter will be learned (default behaviour). If given as a value, the parameter will stay fixed. - `mlp_hidden`: list of integers, number of hidden units for each hidden diff --git a/spektral/layers/convolutional/graphsage_conv.py b/spektral/layers/convolutional/graphsage_conv.py index 0469bf7f..0e4e1541 100644 --- a/spektral/layers/convolutional/graphsage_conv.py +++ b/spektral/layers/convolutional/graphsage_conv.py @@ -6,8 +6,10 @@ class GraphSageConv(MessagePassing): r""" - A GraphSAGE layer as presented by - [Hamilton et al. (2017)](https://arxiv.org/abs/1706.02216). + A GraphSAGE layer from the paper + + > [Inductive Representation Learning on Large Graphs](https://arxiv.org/abs/1706.02216)
+ > William L. Hamilton et al. **Mode**: single, disjoint. @@ -15,8 +17,8 @@ class GraphSageConv(MessagePassing): This layer computes: $$ - \Z = \big[ \textrm{AGGREGATE}(\X) \| \X \big] \W + \b; \\ - \Z = \frac{\Z}{\|\Z\|} + \X' = \big[ \textrm{AGGREGATE}(\X) \| \X \big] \W + \b; \\ + \X' = \frac{\X'}{\|\X'\|} $$ where \( \textrm{AGGREGATE} \) is a function to aggregate a node's neighbourhood. The supported aggregation methods are: sum, mean, diff --git a/spektral/layers/convolutional/message_passing.py b/spektral/layers/convolutional/message_passing.py index 5605d7c5..6e2b3069 100644 --- a/spektral/layers/convolutional/message_passing.py +++ b/spektral/layers/convolutional/message_passing.py @@ -10,8 +10,10 @@ class MessagePassing(Layer): r""" - A general class for message passing as presented by - [Gilmer et al. (2017)](https://arxiv.org/abs/1704.01212). + A general class for message passing networks from the paper + + > [Neural Message Passing for Quantum Chemistry](https://arxiv.org/abs/1704.01212)
+ > Justin Gilmer et al. **Mode**: single, disjoint. @@ -19,8 +21,8 @@ class MessagePassing(Layer): This layer computes: $$ - \Z_i = \gamma \left( \X_i, \square_{j \in \mathcal{N}(i)} \, - \phi \left(\X_i, \X_j, \E_{j,i} \right) \right), + \x_i' = \gamma \left( \x_i, \square_{j \in \mathcal{N}(i)} \, + \phi \left(\x_i, \x_j, \e_{j \rightarrow i} \right) \right), $$ where \( \gamma \) is a differentiable update function, \( \phi \) is a diff --git a/spektral/layers/convolutional/tag_conv.py b/spektral/layers/convolutional/tag_conv.py index 1fc2181c..68809c5d 100644 --- a/spektral/layers/convolutional/tag_conv.py +++ b/spektral/layers/convolutional/tag_conv.py @@ -7,8 +7,10 @@ class TAGConv(MessagePassing): r""" - A Topology Adaptive Graph Convolutional layer (TAG) as presented by - [Du et al. (2017)](https://arxiv.org/abs/1710.10370). + A Topology Adaptive Graph Convolutional layer (TAG) from the paper + + > [Topology Adaptive Graph Convolutional Networks](https://arxiv.org/abs/1710.10370)
+ > Jian Du et al. **Mode**: single, disjoint. diff --git a/spektral/layers/pooling/diff_pool.py b/spektral/layers/pooling/diff_pool.py index 398a9191..d0a93a9a 100644 --- a/spektral/layers/pooling/diff_pool.py +++ b/spektral/layers/pooling/diff_pool.py @@ -9,15 +9,16 @@ class DiffPool(Layer): r""" - A DiffPool layer as presented by - [Ying et al. (2018)](https://arxiv.org/abs/1806.08804). + A DiffPool layer from the paper + + > [Hierarchical Graph Representation Learning with Differentiable Pooling](https://arxiv.org/abs/1806.08804)
+ > Rex Ying et al. **Mode**: batch. This layer computes a soft clustering \(\S\) of the input graphs using a GNN, and reduces graphs as follows: - - $$ +$$ \S = \textrm{GNN}(\A, \X); \\ \A' = \S^\top \A \S; \X' = \S^\top \X; $$ diff --git a/spektral/layers/pooling/global_pool.py b/spektral/layers/pooling/global_pool.py index e341908e..b415984e 100644 --- a/spektral/layers/pooling/global_pool.py +++ b/spektral/layers/pooling/global_pool.py @@ -139,8 +139,10 @@ def __init__(self, **kwargs): class GlobalAttentionPool(GlobalPool): r""" - A gated attention global pooling layer as presented by - [Li et al. (2017)](https://arxiv.org/abs/1511.05493). + A gated attention global pooling layer from the paper + + > [Gated Graph Sequence Neural Networks](https://arxiv.org/abs/1511.05493)
+ > Yujia Li et al. This layer computes: $$ diff --git a/spektral/layers/pooling/mincut_pool.py b/spektral/layers/pooling/mincut_pool.py index 1c4888a6..3094c8be 100644 --- a/spektral/layers/pooling/mincut_pool.py +++ b/spektral/layers/pooling/mincut_pool.py @@ -8,19 +8,19 @@ class MinCutPool(Layer): r""" - A minCUT pooling layer as presented by - [Bianchi et al. (2019)](https://arxiv.org/abs/1907.00481). + A MinCut pooling layer from the paper + + > [Spectral Clustering with Graph Neural Networks for Graph Pooling](https://arxiv.org/abs/1907.00481)
+ > Filippo Maria Bianchi et al. **Mode**: batch. This layer computes a soft clustering \(\S\) of the input graphs using a MLP, and reduces graphs as follows: - $$ \S = \textrm{MLP}(\X); \\ \A' = \S^\top \A \S; \X' = \S^\top \X; $$ - where MLP is a multi-layer perceptron with softmax output. Two auxiliary loss terms are also added to the model: the _minCUT loss_ $$ diff --git a/spektral/layers/pooling/sag_pool.py b/spektral/layers/pooling/sag_pool.py index c3825b22..2bf68c1d 100644 --- a/spektral/layers/pooling/sag_pool.py +++ b/spektral/layers/pooling/sag_pool.py @@ -5,14 +5,15 @@ class SAGPool(TopKPool): r""" - A self-attention graph pooling layer as presented by - [Lee et al. (2019)](https://arxiv.org/abs/1904.08082). + A self-attention graph pooling layer (SAG) from the paper + + > [Self-Attention Graph Pooling](https://arxiv.org/abs/1904.08082)
+ > Junhyun Lee et al. **Mode**: single, disjoint. This layer computes the following operations: - - $$ +$$ \y = \textrm{GNN}(\A, \X); \;\;\;\; \i = \textrm{rank}(\y, K); \;\;\;\; \X' = (\X \odot \textrm{tanh}(\y))_\i; \;\;\;\; diff --git a/spektral/layers/pooling/topk_pool.py b/spektral/layers/pooling/topk_pool.py index d57f2dae..e706fa4c 100644 --- a/spektral/layers/pooling/topk_pool.py +++ b/spektral/layers/pooling/topk_pool.py @@ -7,15 +7,20 @@ class TopKPool(Layer): r""" - A gPool/Top-K layer as presented by - [Gao & Ji (2019)](http://proceedings.mlr.press/v97/gao19a/gao19a.pdf) and - [Cangea et al. (2018)](https://arxiv.org/abs/1811.01287). + A gPool/Top-K layer from the papers + + > [Graph U-Nets](https://arxiv.org/abs/1905.05178)
+ > Hongyang Gao and Shuiwang Ji + + and + + > [Towards Sparse Hierarchical Graph Classifiers](https://arxiv.org/abs/1811.01287)
+ > Cătălina Cangea et al. **Mode**: single, disjoint. This layer computes the following operations: - - $$ +$$ \y = \frac{\X\p}{\|\p\|}; \;\;\;\; \i = \textrm{rank}(\y, K); \;\;\;\; \X' = (\X \odot \textrm{tanh}(\y))_\i; \;\;\;\; diff --git a/spektral/models/general_gnn.py b/spektral/models/general_gnn.py index 256dcde4..0762c976 100644 --- a/spektral/models/general_gnn.py +++ b/spektral/models/general_gnn.py @@ -17,11 +17,10 @@ def get_act(identifier): class GeneralGNN(Model): r""" - This model implements the GNN architecture described in + This model implements the GNN architecture from the paper > [Design Space for Graph Neural Networks](https://arxiv.org/abs/2011.08843)
- > Jiaxuan You, Rex Ying, Jure Leskovec
- > NeurIPS 2020 + > Jiaxuan You, Rex Ying, Jure Leskovec The default parameters of the model are selected according to the best results obtained in the paper, and should provide a good performance on @@ -47,15 +46,13 @@ class GeneralGNN(Model): The dense layers of the pre-processing and post-processing MLPs compute the following update of the node features: - - $$ +$$ \h_i = \mathrm{Act} \left( \mathrm{Dropout} \left( \mathrm{BN} \left( \x_i \W + \b \right) \right) \right) $$ Message-passing layers compute: - - $$ +$$ \h_i = \mathrm{Agg} \left( \left\{ \mathrm{Act} \left( \mathrm{Dropout} \left( \mathrm{BN} \left( \x_j \W + \b \right) \right) \right), j \in \mathcal{N}(i) \right\} \right) diff --git a/spektral/transforms/__init__.py b/spektral/transforms/__init__.py index 549af269..183bcfca 100644 --- a/spektral/transforms/__init__.py +++ b/spektral/transforms/__init__.py @@ -1,6 +1,8 @@ from .adj_to_sp_tensor import AdjToSpTensor +from .clustering_coefficient import ClusteringCoeff from .constant import Constant -from .degree import Degree, MaxDegree +from .degree import Degree +from .delaunay import Delaunay from .gcn_filter import GCNFilter from .layer_preprocess import LayerPreprocess from .normalize_adj import NormalizeAdj diff --git a/spektral/transforms/clustering_coefficient.py b/spektral/transforms/clustering_coefficient.py new file mode 100644 index 00000000..a539c9dc --- /dev/null +++ b/spektral/transforms/clustering_coefficient.py @@ -0,0 +1,22 @@ +import networkx as nx +import numpy as np + + +class ClusteringCoeff: + """ + Concatenates to each node attribute the clustering coefficient of the + corresponding node. + """ + def __call__(self, graph): + if 'a' not in graph: + raise ValueError('The graph must have an adjacency matrix') + clustering_coeff = nx.clustering(nx.Graph(graph.a)) + clustering_coeff = np.array( + [clustering_coeff[i] for i in range(graph.n_nodes)])[:, None] + + if 'x' not in graph: + graph.x = clustering_coeff + else: + graph.x = np.concatenate((graph.x, clustering_coeff), axis=-1) + + return graph diff --git a/spektral/transforms/constant.py b/spektral/transforms/constant.py index 53ccd273..2126c100 100644 --- a/spektral/transforms/constant.py +++ b/spektral/transforms/constant.py @@ -5,9 +5,6 @@ class Constant(object): """ Concatenates a constant value to the node attributes. - If the graph doesn't have node attributes, then they are created and set to - `value`. - **Arguments** - `value`: the value to concatenate to the node attributes. diff --git a/spektral/transforms/degree.py b/spektral/transforms/degree.py index 37f357e6..b150d8f0 100644 --- a/spektral/transforms/degree.py +++ b/spektral/transforms/degree.py @@ -8,9 +8,6 @@ class Degree(object): Concatenates to each node attribute the one-hot degree of the corresponding node. - If the graph doesn't have node attributes, then they are created and set to - the degree. - The adjacency matrix is expected to have integer entries and the degree is cast to integer before one-hot encoding. 
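
The transforms above are plain callables that map a `Graph` to a modified `Graph`, so they can be tried on a single graph before being applied to a whole dataset. The following is a minimal sketch, assuming only the `Graph` container and the `ClusteringCoeff` transform shown above; the commented-out `TUDataset('PROTEINS')` / `dataset.apply()` usage is an assumption about the dataset API and is included for illustration only.

```python
import numpy as np
import scipy.sparse as sp

from spektral.data import Graph
from spektral.transforms import ClusteringCoeff

# A small, fully connected graph with 5 nodes and 2 node features.
x = np.ones((5, 2))
a = sp.csr_matrix(np.ones((5, 5)) - np.eye(5))
g = Graph(x=x, a=a)

# Transforms are callables: they take a Graph and return a Graph.
g = ClusteringCoeff()(g)
print(g.x.shape)  # (5, 3): the clustering coefficient is concatenated to x

# Assumed dataset-level usage (dataset.apply() maps the transform over all graphs):
# dataset = TUDataset('PROTEINS')
# dataset.apply(ClusteringCoeff())
```

The same pattern extends to `Constant`, `Degree`, and the other transforms imported above, since each one only needs access to the attributes of a single `Graph`.
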
diff --git a/spektral/transforms/delaunay.py b/spektral/transforms/delaunay.py new file mode 100644 index 00000000..2e423b59 --- /dev/null +++ b/spektral/transforms/delaunay.py @@ -0,0 +1,20 @@ +from scipy.spatial import Delaunay as DelaunaySP +import numpy as np +import scipy.sparse as sp + + +class Delaunay: + def __call__(self, graph): + if 'x' not in graph: + raise ValueError('The graph must have node features') + if graph.n_node_features != 2: + raise ValueError('Can only compute triangulation for 2-d points.') + tri = DelaunaySP(graph.x) + edges = np.concatenate((tri.vertices[:, :2], + tri.vertices[:, 1:], + tri.vertices[:, ::2]), axis=0) + values = np.ones(edges.shape[0]) + graph.a = sp.csr_matrix((values, edges.T)) + graph.a.data = np.clip(graph.a.data, 0, 1) + + return graph diff --git a/spektral/transforms/gcn_filter.py b/spektral/transforms/gcn_filter.py index 9985e108..aa5e3294 100644 --- a/spektral/transforms/gcn_filter.py +++ b/spektral/transforms/gcn_filter.py @@ -5,8 +5,7 @@ class GCNFilter(object): r""" Normalizes the adjacency matrix as described by [Kipf & Welling (2017)](https://arxiv.org/abs/1609.02907): - - $$ +$$ \A \leftarrow \hat\D^{-\frac{1}{2}} (\A + \I) \hat\D^{-\frac{1}{2}} $$ diff --git a/spektral/transforms/normalize_adj.py b/spektral/transforms/normalize_adj.py index da105cdf..6dea6c2d 100644 --- a/spektral/transforms/normalize_adj.py +++ b/spektral/transforms/normalize_adj.py @@ -4,8 +4,7 @@ class NormalizeAdj(object): r""" Normalizes the adjacency matrix as: - - $$ +$$ \A \leftarrow \D^{-1/2}\A\D^{-1/2} $$ diff --git a/spektral/transforms/normalize_one.py b/spektral/transforms/normalize_one.py index 5b106ee8..b401ae63 100644 --- a/spektral/transforms/normalize_one.py +++ b/spektral/transforms/normalize_one.py @@ -5,7 +5,6 @@ class NormalizeOne: r""" Normalizes the node attributes by dividing each row by its sum, so that it sums to 1: - $$ \X_i \leftarrow \frac{\X_i}{\sum_{j=1}^{N} \X_{ij}} $$ diff --git a/spektral/transforms/normalize_sphere.py b/spektral/transforms/normalize_sphere.py index 646b34a2..9616fabb 100644 --- a/spektral/transforms/normalize_sphere.py +++ b/spektral/transforms/normalize_sphere.py @@ -5,8 +5,7 @@ class NormalizeSphere: r""" Normalizes the node attributes so that they are centered at the origin and contained within a sphere of radius 1: - - $$ +$$ \X_{i} \leftarrow \frac{\X_{i} - \bar\X}{\max_{i,j} \X_{ij}} $$ diff --git a/spektral/utils/io.py b/spektral/utils/io.py index fa5bfedb..740de6bb 100644 --- a/spektral/utils/io.py +++ b/spektral/utils/io.py @@ -5,6 +5,7 @@ import networkx as nx import numpy as np import pandas as pd +import scipy.sparse as sp def load_binary(filename): @@ -144,6 +145,48 @@ def dump_txt(obj, filename, **kwargs): np.savetxt(filename, obj, **kwargs) +def _parse_off(lines): + n_verts, n_faces, _ = map(int, lines[0].split(' ')) + + # Read vertices + verts = np.array([l.split(' ') for l in lines[1:n_verts + 1]]).astype(float) + + # Read faces + faces = lines[n_verts + 1:n_verts + 1 + n_faces] + faces = [list(map(int, f.split(' '))) for f in faces] + triangles = np.array(list(filter(lambda f: len(f) == 4, faces))).astype(int) + rectangles = np.array(list(filter(lambda f: len(f) == 5, faces))).astype(int) + if len(rectangles) > 0: + tri_a = rectangles[:, [1, 2, 3]] + tri_b = rectangles[:, [1, 2, 4]] + triangles = np.vstack((triangles, tri_a, tri_b)) + triangles = triangles[:, 1:] + triangles = triangles[triangles[:, 0].argsort()] + + return verts, triangles + + +def load_off(filename): + """ + Reads an .off file 
into a Graph object. Node attributes are the 3d + coordinates of the points, all faces are converted to edges. + :param filename: path to the .off file. + :return: a Graph + """ + from spektral.data.graph import Graph + + lines = open(filename, 'r').read().lstrip('OF\n').splitlines() + x, faces = _parse_off(lines) + n = x.shape[0] + row, col = np.vstack((faces[:, :2], faces[:, 1:], faces[:, ::2])).T + adj = sp.csr_matrix((np.ones_like(row), (row, col)), shape=(n, n)).tolil() + adj[col, row] = adj[row, col] + adj = adj.T.tocsr() + adj.data = np.clip(adj.data, 0, 1) + + return Graph(x=x, adj=adj) + + # Reference for implementation: # # http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx # @@ -205,11 +248,6 @@ def dump_txt(obj, filename, **kwargs): def _get_atomic_num(symbol): - """ - Given an atomic symbol (e.g., 'C'), returns its atomic number (e.g., 6) - :param symbol: string, atomic symbol - :return: int <= 118 - """ return SYMBOL_TO_NUM[symbol.lower().capitalize()] diff --git a/tests/test_transforms/test_transforms.py b/tests/test_transforms/test_transforms.py index de2b519b..819f93ba 100644 --- a/tests/test_transforms/test_transforms.py +++ b/tests/test_transforms/test_transforms.py @@ -2,9 +2,7 @@ import scipy.sparse as sp from spektral.data import Graph -from spektral.transforms import (AdjToSpTensor, Constant, Degree, GCNFilter, - LayerPreprocess, NormalizeAdj, NormalizeOne, - NormalizeSphere, OneHotLabels) +from spektral import transforms as tr N = 10 F = 3 @@ -24,14 +22,23 @@ def test_adj_to_sp_tensor(): - t = AdjToSpTensor() + t = tr.AdjToSpTensor() g = Graph(x=x, a=a, e=e, y=y_gl) assert callable(t) t(g) +def test_clustering_coeff(): + t = tr.ClusteringCoeff() + assert callable(t) + g = Graph(x=x, a=a, e=e, y=y_gl) + t(g) + g = Graph(x=None, a=a, e=e, y=y_gl) + t(g) + + def test_constant(): - t = Constant(10) + t = tr.Constant(10) assert callable(t) g = Graph(x=x, a=a, e=e, y=y_gl) t(g) @@ -40,7 +47,7 @@ def test_constant(): def test_degree(): - t = Degree(10) + t = tr.Degree(10) assert callable(t) g = Graph(x=x, a=a, e=e, y=y_gl) t(g) @@ -48,8 +55,18 @@ def test_degree(): t(g) +def test_delaunay(): + t = tr.Delaunay() + assert callable(t) + x = np.random.rand(N, 2) + g = Graph(x=x, a=a, e=e, y=y_nl) + t(g) + g = Graph(x=x, a=a.A, e=e, y=y_nl) + t(g) + + def test_gcn_filter(): - t = GCNFilter() + t = tr.GCNFilter() assert callable(t) g = Graph(x=x, a=a, e=e, y=y_nl) t(g) @@ -59,14 +76,14 @@ def test_gcn_filter(): def test_layer_preprocess(): from spektral.layers import GCNConv - t = LayerPreprocess(GCNConv) + t = tr.LayerPreprocess(GCNConv) assert callable(t) g = Graph(x=x, a=a, e=e, y=y_nl) t(g) def test_normalize_adj(): - t = NormalizeAdj + t = tr.NormalizeAdj() assert callable(t) g = Graph(x=x, a=a, e=e, y=y_nl) t(g) @@ -75,21 +92,21 @@ def test_normalize_adj(): def test_normalize_one(): - t = NormalizeOne() + t = tr.NormalizeOne() assert callable(t) g = Graph(x=x, a=a, e=e, y=y_gl) t(g) def test_normalize_sphere(): - t = NormalizeSphere() + t = tr.NormalizeSphere() assert callable(t) g = Graph(x=x, a=a, e=e, y=y_gl) t(g) def test_one_hot(): - t = OneHotLabels(depth=2) + t = tr.OneHotLabels(depth=2) assert callable(t) g = Graph(x=x, a=a, e=e, y=y_gl) t(g) From 2e1ef7c7c735a1f20bf728ea73fcbf0fa92a7f0e Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 26 Nov 2020 17:57:42 +0100 Subject: [PATCH 45/57] Add QM7 dataset Make Loaders work even if soe attributes are not passed --- spektral/data/loaders.py | 78 
+++++++++++++++++++++++++-------- spektral/datasets/__init__.py | 3 +- spektral/datasets/qm7.py | 49 +++++++++++++++++++++ tests/test_data/test_loaders.py | 2 +- tests/test_datasets.py | 27 ++++++++---- 5 files changed, 130 insertions(+), 29 deletions(-) create mode 100644 spektral/datasets/qm7.py diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index 65fce8bd..539a4341 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -34,12 +34,12 @@ def collate(self, batch): Since all data matrices (node attributes, adjacency matrices, etc.) are usually collated together, the two list comprehensions of the example - above can be computed all at once by using the private `_pack()` method + above can be computed all at once by using the private `pack()` method of the Loader class: ```python def collate(self, batch): - x, a = self._pack(batch) + x, a = self.pack(batch) return np.array(x), np.array(a) ``` @@ -105,8 +105,40 @@ def tf_signature(self): signature = self.dataset.signature return to_tf_signature(signature) - def _pack(self, batch): - return [list(elem) for elem in zip(*[g.numpy() for g in batch])] + def pack(self, batch, return_dict=False): + """ + Given a batch of graphs, groups their attributes into separate lists. + + For instance, if a batch has three graphs g1, g2 and g3 with node + features (x1, x2, x3) and adjacency matrices (a1, a2, a3), this method + will return: + + ``` + a_list = [a1, a2, a3] + x_list = [x1, x2, x3] + ``` + + If `return_dict=True`, the lists are wrapped in a dictionary: + + ``` + {'a_list': [a1, a2, a3], + 'x_list': [x1, x2, x3]} + ``` + + this is useful for passing the packed batch to `data.utils.to_batch()` + and `data.utils.to_disjoint()` without knowing a-priori what are the + attributes of the graphs. + + :param batch: a list of Graphs + :param return_dict: whether to return the lists as element of a dictionary. + :return: the batch packed into lists, by attribute type. 
+ """ + output = [list(elem) for elem in zip(*[g.numpy() for g in batch])] + if return_dict: + keys = [k + '_list' for k in self.dataset.signature.keys()] + return {k: v for k, v in zip(keys, output)} + else: + return output @property def steps_per_epoch(self): @@ -233,21 +265,23 @@ def __init__(self, dataset, node_level=False, batch_size=1, epochs=None, epochs=epochs, shuffle=shuffle) def collate(self, batch): - packed = self._pack(batch) - if self.node_level: - y = np.vstack(packed[-1]) - else: - y = np.array(packed[-1]) - output = to_disjoint(*packed[:-1]) + packed = self.pack(batch, return_dict=True) + y = None + if 'y' in self.dataset.signature: + y = packed.pop('y_list') + y = np.vstack(y) if self.node_level else np.array(y) - # Sparse matrices to SparseTensors + output = to_disjoint(**packed) output = list(output) for i in range(len(output)): if sp.issparse(output[i]): output[i] = sp_matrix_to_sp_tensor(output[i]) output = tuple(output) - return output, y + if y is None: + return output + else: + return output, y def load(self): if not tf_loader_available: @@ -322,13 +356,17 @@ class BatchLoader(Loader): """ def collate(self, batch): - packed = self._pack(batch) - y = np.array(packed[-1]) - output = to_batch(*packed[:-1]) + packed = self.pack(batch, return_dict=True) + y = np.array(packed.pop('y_list')) if 'y' in self.dataset.signature else None + + output = to_batch(**packed) if len(output) == 1: output = output[0] - return output, y + if y is None: + return output + else: + return output, y def tf_signature(self): signature = self.dataset.signature @@ -365,9 +403,13 @@ class PackedBatchLoader(BatchLoader): """ def __init__(self, dataset, batch_size=1, epochs=None, shuffle=True): super().__init__(dataset, batch_size=batch_size, epochs=epochs, shuffle=shuffle) + # Drop the Dataset container and work on packed tensors directly - self.dataset = self._pack(self.dataset) - self.dataset = to_batch(*self.dataset[:-1]) + (np.array(self.dataset[-1]), ) + packed = self.pack(self.dataset, return_dict=True) + y = np.array(packed.pop('y_list')) if 'y' in dataset.signature else None + self.dataset = to_batch(**packed) + if y is not None: + self.dataset += (y, ) # Re-instantiate generator after packing dataset self._generator = self.generator() diff --git a/spektral/datasets/__init__.py b/spektral/datasets/__init__.py index d6419daf..c42f7e82 100644 --- a/spektral/datasets/__init__.py +++ b/spektral/datasets/__init__.py @@ -1,7 +1,8 @@ -from .citation import Citation +from .citation import Citation, Cora, Citeseer, Pubmed from .graphsage import GraphSage, PPI, Reddit from .mnist import MNIST from .modelnet import ModelNet from .ogb import OGB +from .qm7 import QM7 from .qm9 import QM9 from .tudataset import TUDataset diff --git a/spektral/datasets/qm7.py b/spektral/datasets/qm7.py new file mode 100644 index 00000000..0b226669 --- /dev/null +++ b/spektral/datasets/qm7.py @@ -0,0 +1,49 @@ +import os.path as osp + +import numpy as np +import scipy.sparse as sp +from scipy.io import loadmat +from tensorflow.keras.utils import get_file + +from spektral.data import Dataset, Graph + + +class QM7(Dataset): + """ + The QM7b dataset of molecules from the paper: + + > [MoleculeNet: A Benchmark for Molecular Machine Learning](https://arxiv.org/abs/1703.00564)
+ > Zhenqin Wu et al. + + The dataset has no node features. + Edges and edge features are obtained from the Coulomb matrices of the + molecules. + + Each graph has a 14-dimensional label for regression. + """ + url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm7b.mat' + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def download(self): + get_file('qm7b.mat', self.url, extract=True, cache_dir=self.path, + cache_subdir=self.path) + + def read(self): + print('Loading QM7 dataset.') + mat_file = osp.join(self.path, 'qm7b.mat') + data = loadmat(mat_file) + + coulomb_matrices = data['X'] + labels = data['T'] + + output = [] + for i in range(len(coulomb_matrices)): + row, col, data = sp.find(coulomb_matrices[i]) + a = sp.csr_matrix((np.ones_like(data), (row, col))) + e = data[:, None] + y = labels[i] + output.append(Graph(a=a, e=e, y=y)) + + return output diff --git a/tests/test_data/test_loaders.py b/tests/test_data/test_loaders.py index b6ae16cd..86d2dfcb 100644 --- a/tests/test_data/test_loaders.py +++ b/tests/test_data/test_loaders.py @@ -109,7 +109,7 @@ def test_batch(): assert y.shape == (graphs_in_batch, 2) -def test_fast_batch(): +def test_packed_batch(): data = TestDataset() loader = PackedBatchLoader(data, batch_size=batch_size, epochs=1, shuffle=False) batches = [b for b in loader] diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 7e49bb46..b1098b03 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,28 +1,37 @@ from spektral.data import DisjointLoader, BatchLoader, SingleLoader -from spektral.datasets import qm9, citation, graphsage, mnist, tudataset +from spektral import datasets batch_size = 3 def test_citation(): - dataset = citation.Cora() - dataset = citation.Citeseer(random_split=True) - dataset = citation.Pubmed(normalize_x=True) + dataset = datasets.Cora() + dataset = datasets.Citeseer(random_split=True) + dataset = datasets.Pubmed(normalize_x=True) sl = SingleLoader(dataset) def test_graphsage(): # Test only PPI because Travis otherwise runs into memory errors - dataset = graphsage.PPI() + dataset = datasets.PPI() sl = SingleLoader(dataset) def test_mnist(): - dataset = mnist.MNIST(k=8, noise_level=0.1) + dataset = datasets.MNIST(k=8, noise_level=0.1) + + +def test_qm7(): + dataset = datasets.QM7() + dl = DisjointLoader(dataset, batch_size=batch_size) + dl.__next__() + + bl = BatchLoader(dataset, batch_size=batch_size) + bl.__next__() def test_qm9(): - dataset = qm9.QM9(amount=100) + dataset = datasets.QM9(amount=100) dl = DisjointLoader(dataset, batch_size=batch_size) dl.__next__() @@ -32,7 +41,7 @@ def test_qm9(): def test_tud(): # Edge labels + edge attributes - dataset = tudataset.TUDataset('BZR_MD', clean=False) + dataset = datasets.TUDataset('BZR_MD', clean=False) dl = DisjointLoader(dataset, batch_size=batch_size) dl.__next__() @@ -40,7 +49,7 @@ def test_tud(): bl.__next__() # Node labels + node attributes + clean version - dataset = tudataset.TUDataset('ENZYMES', clean=True) + dataset = datasets.TUDataset('ENZYMES', clean=True) dl = DisjointLoader(dataset, batch_size=batch_size) dl.__next__() From 742999b114428b5a92c329a9bd83e7b5b09ca1c3 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 26 Nov 2020 18:06:10 +0100 Subject: [PATCH 46/57] Move GeneralGNN example to examples --- examples/graph_prediction/general_gnn.py | 78 ++++++++++++++++++++++++ spektral/models/general_gnn.py | 65 +------------------- 2 files changed, 80 insertions(+), 63 deletions(-) create mode 100644 
examples/graph_prediction/general_gnn.py diff --git a/examples/graph_prediction/general_gnn.py b/examples/graph_prediction/general_gnn.py new file mode 100644 index 00000000..aae3e7fa --- /dev/null +++ b/examples/graph_prediction/general_gnn.py @@ -0,0 +1,78 @@ +""" +This example implements an experiment from the paper + + > [Design Space for Graph Neural Networks](https://arxiv.org/abs/2011.08843)
+ > Jiaxuan You, Rex Ying, Jure Leskovec + +using the PROTEINS dataset. + +The configuration at the top of the file is the best one identified in the +paper, and should work well for many different datasets without changes. + +Note: the results reported in the paper are averaged over 3 random repetitions +with an 80/20 split. +""" +import tensorflow as tf +import numpy as np +from spektral.models import GeneralGNN + +from spektral.data import DisjointLoader + +from spektral.datasets import TUDataset +from tensorflow.keras.optimizers import Adam + +physical_devices = tf.config.list_physical_devices('GPU') +tf.config.experimental.set_memory_growth(physical_devices[0], True) + +# Best config +batch_size = 32 +learning_rate = 0.01 +epochs = 400 + +# Read data +data = TUDataset('PROTEINS') + +# Train/test split +np.random.shuffle(data) +split = int(0.8 * len(data)) +data_tr, data_te = data[:split], data[split:] + +# Data loader +loader_tr = DisjointLoader(data_tr, batch_size=batch_size, epochs=epochs) +loader_te = DisjointLoader(data_te, batch_size=batch_size) + +# Create model +model = GeneralGNN(data.n_labels, activation='softmax') +optimizer = Adam(learning_rate) +model.compile('adam', 'categorical_crossentropy', metrics=['categorical_accuracy']) + + +# Evaluation function +def evaluate(loader): + step = 0 + results = [] + for batch in loader: + step += 1 + loss, acc = model.test_on_batch(*batch) + results.append((loss, acc)) + if step == loader.steps_per_epoch: + return np.mean(results, 0) + + +# Training loop +epoch = step = 0 +results = [] +for batch in loader_tr: + step += 1 + loss, acc = model.train_on_batch(*batch) + results.append((loss, acc)) + if step == loader_tr.steps_per_epoch: + step = 0 + epoch += 1 + results_te = evaluate(loader_te) + print('Epoch {} - Train loss: {:.3f} - Train acc: {:.3f} - ' + 'Test loss: {:.3f} - Test acc: {:.3f}' + .format(epoch, *np.mean(results, 0), *results_te)) + +results_te = evaluate(loader_te) +print('Final results - Loss: {:.3f} - Acc: {:.3f}'.format(*results_te)) diff --git a/spektral/models/general_gnn.py b/spektral/models/general_gnn.py index 0762c976..a118ddc3 100644 --- a/spektral/models/general_gnn.py +++ b/spektral/models/general_gnn.py @@ -2,8 +2,6 @@ from tensorflow.keras.layers import Dense, Concatenate, Add, BatchNormalization, Dropout, Activation from tensorflow.keras.layers import PReLU -from spektral.data import DisjointLoader -from spektral.datasets import TUDataset from spektral.layers import GeneralConv from spektral.layers.pooling import global_pool @@ -46,13 +44,13 @@ class GeneralGNN(Model): The dense layers of the pre-processing and post-processing MLPs compute the following update of the node features: -$$ + $$ \h_i = \mathrm{Act} \left( \mathrm{Dropout} \left( \mathrm{BN} \left( \x_i \W + \b \right) \right) \right) $$ Message-passing layers compute: -$$ + $$ \h_i = \mathrm{Agg} \left( \left\{ \mathrm{Act} \left( \mathrm{Dropout} \left( \mathrm{BN} \left( \x_j \W + \b \right) \right) \right), j \in \mathcal{N}(i) \right\} \right) @@ -154,62 +152,3 @@ def __init__(self, output, hidden=256, layers=2, batch_norm=True, def call(self, inputs): return self.mlp(inputs) - - -if __name__ == '__main__': - import tensorflow as tf - import numpy as np - from tensorflow.keras.optimizers import Adam - physical_devices = tf.config.list_physical_devices('GPU') - tf.config.experimental.set_memory_growth(physical_devices[0], True) - - # Best config - batch_size = 32 - learning_rate = 0.01 - epochs = 400 - - # Read data - data = 
TUDataset('PROTEINS') - - # Train/test split - np.random.shuffle(data) - split = int(0.8 * len(data)) - data_tr, data_te = data[:split], data[split:] - - # Data loader - loader_tr = DisjointLoader(data_tr, batch_size=batch_size, epochs=epochs) - loader_te = DisjointLoader(data_te, batch_size=batch_size) - - # Create model - model = GeneralGNN(data.n_labels, activation='softmax') - optimizer = Adam(learning_rate) - model.compile('adam', 'categorical_crossentropy', metrics=['categorical_accuracy']) - - # Evaluation function - def eval(loader): - step = 0 - results = [] - for batch in loader: - step += 1 - l, a = model.test_on_batch(*batch) - results.append((l, a)) - if step == loader.steps_per_epoch: - return np.mean(results, 0) - - # Training loop - epoch = step = 0 - results = [] - for batch in loader_tr: - step += 1 - l, a = model.train_on_batch(*batch) - results.append((l, a)) - if step == loader_tr.steps_per_epoch: - step = 0 - epoch += 1 - results_te = eval(loader_te) - print('Epoch {} - Train loss: {:.3f} - Train acc: {:.3f} - ' - 'Test loss: {:.3f} - Test acc: {:.3f}' - .format(epoch, *np.mean(results, 0), *results_te)) - - results_te = eval(loader_te) - print('Final results - Loss: {:.3f} - Acc: {:.3f}'.format(*results_te)) From 0d4988251319115d1b02040ab8a73b5227218c44 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 26 Nov 2020 18:10:09 +0100 Subject: [PATCH 47/57] QM9 docs --- spektral/datasets/qm9.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/spektral/datasets/qm9.py b/spektral/datasets/qm9.py index 5387dcd7..e3183af5 100644 --- a/spektral/datasets/qm9.py +++ b/spektral/datasets/qm9.py @@ -28,12 +28,10 @@ class QM9(Dataset): - The atomic charge; - The mass difference from the monoisotope; - See [this link](http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx) - for more information. + The edge features represent the type of chemical bond between two atoms, + one-hot encoded. - The edge features represent the type of chemical bond between two atoms. - - Labels represent... TODO + Each graph has an 18-dimensional label for regression. 
**Arguments** From a5f185bff97b7130d16d2770d1e7c4d28899f91d Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 26 Nov 2020 18:12:37 +0100 Subject: [PATCH 48/57] Fix issue in general_gnn.py --- examples/graph_prediction/general_gnn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/graph_prediction/general_gnn.py b/examples/graph_prediction/general_gnn.py index aae3e7fa..6f1ba8cb 100644 --- a/examples/graph_prediction/general_gnn.py +++ b/examples/graph_prediction/general_gnn.py @@ -22,7 +22,8 @@ from tensorflow.keras.optimizers import Adam physical_devices = tf.config.list_physical_devices('GPU') -tf.config.experimental.set_memory_growth(physical_devices[0], True) +if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) # Best config batch_size = 32 From e8072be5559050c25882e24abd4c2540ccc20dd4 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Thu, 26 Nov 2020 19:20:13 +0100 Subject: [PATCH 49/57] Update docs --- README.md | 42 +++++++++++++----------- docs/autogen.py | 1 + spektral/data/loaders.py | 57 +++++++++++++++++++-------------- spektral/transforms/delaunay.py | 9 ++++++ 4 files changed, 67 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 42e73f60..d8ee50df 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ You can use Spektral for classifying the userss of a social network, predicting Spektral implements some of the most popular layers for graph deep learning, including: - [Graph Convolutional Networks (GCN)](https://arxiv.org/abs/1609.02907) -- [Chebyshev networks (ChebNets)](https://arxiv.org/abs/1606.09375) +- [Chebyshev convolutions](https://arxiv.org/abs/1606.09375) - [GraphSAGE](https://arxiv.org/abs/1706.02216) - [ARMA convolutions](https://arxiv.org/abs/1901.01343) - [Edge-Conditioned Convolutions (ECC)](https://arxiv.org/abs/1704.02901) @@ -22,15 +22,15 @@ and many others (see [convolutional layers](https://graphneural.network/layers/c You can also find [pooling layers](https://graphneural.network/layers/pooling/), including: +- [MinCut pooling](https://arxiv.org/abs/1907.00481) - [DiffPool](https://arxiv.org/abs/1806.08804) -- [MinCUT pooling](https://arxiv.org/abs/1907.00481) - [Top-K pooling](http://proceedings.mlr.press/v97/gao19a/gao19a.pdf) - [Self-Attention Graph (SAG) pooling](https://arxiv.org/abs/1904.08082) -- Global sum, average, and max pooling +- Global pooling - [Global gated attention pooling](https://arxiv.org/abs/1511.05493) - [SortPool](https://www.cse.wustl.edu/~muhan/papers/AAAI_2018_DGCNN.pdf) -Spektral also includes lots of utilities for your graph deep learning projects. +Spektral also includes lots of utilities for for representing, manipulating, and transforming graphs in your graph deep learning projects. See how to [get started with Spektral](https://graphneural.network/getting-started/) and have a look at the [examples](https://danielegrattarola.github.io/spektral/examples/) for some templates. @@ -39,10 +39,8 @@ Read the documentation [here](https://graphneural.network). If you want to cite Spektral in your work, refer to our paper: -> Graph Neural Networks in TensorFlow and Keras with Spektral -> D. Grattarola and C. Alippi -> ICML 2020 - GRL+ Workshop -> [https://arxiv.org/abs/2006.12138](https://arxiv.org/abs/2006.12138) +> [Graph Neural Networks in TensorFlow and Keras with Spektral](https://arxiv.org/abs/2006.12138)
+> Daniele Grattarola and Cesare Alippi ## Installation Spektral is compatible with Python 3.5+, and is tested on Ubuntu 16.04+ and MacOS. @@ -68,18 +66,26 @@ To install Spektral on [Google Colab](https://colab.research.google.com/): ! pip install spektral ``` -## TensorFlow 1 and Keras -Starting from version 0.3, Spektral only supports TensorFlow 2 and `tf.keras`. -The old version of Spektral, which is based on TensorFlow 1 and the stand-alone Keras library, is still available on the `tf1` branch on GitHub and can be installed from source: +## New in Spektral 1.0 + +The 1.0 release of Spektral is an important milestone for the library and brings many new features and improvements. + +If you have already used Spektral in your projects, the only major change that you need to be aware of is in the `datasets` API. +Your models will continue to work in exactly the same way. + +This is a summary of the new features and changes: + +- The new `Graph` and `Dataset` containers standardize the way in which Spektral handles data. +**This does not impact your models**, but makes it easier to use your own data in Spektral. +- The new `Loader` class hides away all the complexity of creating graph batches. +Whether you want to write your own training loop or use Keras' famous model-dot-fit approach, you only need to worry about the training logic and not the data. +- The new `transforms` module implements a wide variety of common operations on graphs, that you can now `apply()` to your datasets. +- The new `GeneralConv` and `GeneralGNN` classes let you build models that are, well... general. Using state-of-the-art results from recent literature means that you don't need to worry about which layers or architecture to choose. The defaults will work well everywhere. +- New datasets: QM7 and ModelNet10/40, and a new wrapper for OGB datasets. +- Major clean-up of the library's structure and dependencies. +- New examples and tutorials. -```bash -git clone https://github.com/danielegrattarola/spektral.git -cd spektral -git checkout tf1 -python setup.py install # Or 'pip install .' -``` -In the future, the TF1-compatible version of Spektral (<0.3) will receive bug fixes, but all new features will only support TensorFlow 2. ## Contributing Spektral is an open source project available [on Github](https://github.com/danielegrattarola/spektral), and contributions of all types are welcome. diff --git a/docs/autogen.py b/docs/autogen.py index fede9715..1247fd8e 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -144,6 +144,7 @@ datasets.MNIST, datasets.ModelNet, datasets.OGB, + datasets.QM7, datasets.QM9, datasets.TUDataset, ] diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index 539a4341..f0f1b941 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -34,8 +34,7 @@ def collate(self, batch): Since all data matrices (node attributes, adjacency matrices, etc.) are usually collated together, the two list comprehensions of the example - above can be computed all at once by using the private `pack()` method - of the Loader class: + above can be computed all at once by using the `pack()` method: ```python def collate(self, batch): @@ -43,31 +42,41 @@ def collate(self, batch): return np.array(x), np.array(a) ``` - Additionally, a Loader should implement two main methods that simplify its - usage within the TensorFlow/Keras training pipeline: + The `load()` method of a Loader returns an object that can be given as + input to `Model.fit()`. 
- - `load()`: should return a `tf.data` dataset, a generator, or a - `keras.utils.Sequence`. Its usage pattern should be as follows: + You can use it as follows: - `model.fit(loader.load(), steps_per_epoch=loader.steps_per_epoch)` + ```python + model.fit(loader.load(), steps_per_epoch=loader.steps_per_epoch) + ``` + + The `steps_per_epoch` property represents the number of batches that are in + an epoch, and is a required keyword when calling `model.fit()` with a Loader. - The `steps_per_epoch` property returns the number of batches - (as specified by the `batch_size` argument) that are in an epoch and is - automatically computed from the data. - By default, `load()` will simply return a `tf.data.Dataset.from_generator` - dataset obtained from the Loader itself (since Loaders are also Python - generators). + If you want to write your own training function, you can use the + `tf_signature()` method to specify the signature of your batches using the + tf.TypeSpec system, in order to avoid unnecessary re-tracings. - - `tf_signature()`: this method should return the Tensorflow signature of - the batches computed by `collate(batch)`, using the `tf.TypeSpec` system. - All Datasets have a `signature` property that can be used to compute the - TensorFlow signature (which represents the shape, dtype and TypeSpec of a - each data matrix in a generic graph) with the - `spektral.data.utils.to_tf_signature(signature)` function. + For example, a simple training function can be written as: - By default, `tf_signature()` will simply return the Dataset's signature - converted to the TensorFlow format. + ```python + @tf.function(input_signature=loader.tf_signature(), experimental_relax_shapes=True) + def train_step(inputs, target): + with tf.GradientTape() as tape: + predictions = model(inputs, training=True) + loss = loss_fn(target, predictions) + sum(model.losses) + gradients = tape.gradient(loss, model.trainable_variables) + opt.apply_gradients(zip(gradients, model.trainable_variables)) + ``` + + We can then train our model in a for loop as follows: + + ```python + for batch in loader: + train_step(*batch) + ``` **Arguments** @@ -212,8 +221,8 @@ class DisjointLoader(Loader): """ A Loader for disjoint mode. - This loader produces batches of graphs as their disjoint union, and supports - labels both for graph-level and node-level learning. + This loader represents a batch of graphs via their disjoint union, and + supports labels both for graph-level and node-level learning. Because in disjoint mode we need a way to keep track of which nodes belong to which graph, the loader will also automatically compute a batch index @@ -230,7 +239,7 @@ class DisjointLoader(Loader): If `node_level=True`, then the labels are stacked vertically (i.e., `(n_nodes, n_labels)`). - Note that TensorFlow 2.4 or above is required to use this Loader's `load()` + **Note:** TensorFlow 2.4 or above is required to use this Loader's `load()` method in a Keras training loop. **Arguments** diff --git a/spektral/transforms/delaunay.py b/spektral/transforms/delaunay.py index 2e423b59..2e82ddf4 100644 --- a/spektral/transforms/delaunay.py +++ b/spektral/transforms/delaunay.py @@ -4,6 +4,15 @@ class Delaunay: + """ + Computes the Delaunay triangulation of the node features. + + The adjacency matrix is obtained from the edges of the triangulation and + replaces the previous adjacency matrix. + Duplicate edges are ignored and the adjacency matrix is binary. + + Node features must be 2-dimensional. 
+ """ def __call__(self, graph): if 'x' not in graph: raise ValueError('The graph must have node features') From fa9dd0f1d2865b6f91780fd10f792d326235d34e Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 27 Nov 2020 12:47:23 +0100 Subject: [PATCH 50/57] Add Conv interface for custom layers as alternative to MessagePassing Remove compute_output_shape method from layers Improve get_config system for MessagePassing layers --- docs/mkdocs.yml | 1 + docs/templates/examples.md | 43 +++++------- docs/templates/external.md | 18 +++++ examples/graph_prediction/custom_dataset.py | 19 +++-- examples/graph_prediction/general_gnn.py | 2 +- spektral/layers/convolutional/agnn_conv.py | 13 ++-- spektral/layers/convolutional/appnp_conv.py | 46 ++++++------ spektral/layers/convolutional/arma_conv.py | 51 ++++++-------- spektral/layers/convolutional/cheb_conv.py | 24 +++---- spektral/layers/convolutional/conv.py | 55 +++++++++++++++ spektral/layers/convolutional/crystal_conv.py | 13 ++-- .../layers/convolutional/diffusion_conv.py | 40 ++++++----- spektral/layers/convolutional/ecc_conv.py | 20 +++--- spektral/layers/convolutional/edge_conv.py | 13 ++-- spektral/layers/convolutional/gat_conv.py | 26 +++---- .../layers/convolutional/gated_graph_conv.py | 9 ++- spektral/layers/convolutional/gcn_conv.py | 53 +++++--------- spektral/layers/convolutional/gcs_conv.py | 14 ++-- spektral/layers/convolutional/general_conv.py | 17 +++-- spektral/layers/convolutional/gin_conv.py | 13 ++-- .../layers/convolutional/graphsage_conv.py | 13 ++-- .../layers/convolutional/message_passing.py | 70 +++++++++++-------- spektral/layers/convolutional/tag_conv.py | 13 ++-- spektral/layers/ops/scatter.py | 10 +++ spektral/utils/keras.py | 9 ++- tests/test_layers/test_convolutional.py | 12 +++- 26 files changed, 333 insertions(+), 284 deletions(-) create mode 100644 docs/templates/external.md create mode 100644 spektral/layers/convolutional/conv.py diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 729a6afb..7931ea7b 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -48,4 +48,5 @@ nav: - Convolution: utils/convolution.md - Miscellaneous: utils/misc.md - Other: + - External resources: external.md - About: about.md diff --git a/docs/templates/examples.md b/docs/templates/examples.md index 3e309680..2219d895 100644 --- a/docs/templates/examples.md +++ b/docs/templates/examples.md @@ -1,34 +1,27 @@ # Examples -This is a collection of example scripts that you can use as template to solve your own tasks. +This is a collection of examples that you can use as template to solve your own tasks. 
-## Node classification -- [Node classification on citation networks with GCN](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gcn.py); -- [Node classification on citation networks with ChebNets](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_cheby.py); -- [Node classification on citation networks with GAT](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gat.py); -- [Node classification on citation networks with ARMA](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_arma.py); -- [Node classification on citation networks with SimpleGCN](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_simple_gc.py); -- [Node classification on the Open Graph Benchmark dataset (ogbn-proteins)](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/ogbn-proteins_gcn.py); +## Node-level prediction -## Graph-level prediction - -Batch mode: - -- [Classification of synthetic graphs with GAT](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/delaunay_batch.py); -- [Regression of molecular properties on QM9 with ECC](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/qm9_batch.py); +- [Citation networks with GCN](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gcn.py) +- [Citation networks with ChebConv](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_cheby.py) +- [Citation networks with GAT](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gat.py) +- [Citation networks with ARMA](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_arma.py) +- [Citation networks with SimpleGCN (custom transform)](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_simple_gc.py) +- [Open Graph Benchmark dataset](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/ogbn-proteins_gcn.py) -Disjoint mode: - -- [Classification of synthetic graphs with TopK pooling](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/BDGC_disjoint.py); -- [Regression of molecular properties on QM9 with ECC](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/qm9_disjoint.py); +## Graph-level prediction -## Graph signal classification -- [Graph signal classification on MNIST (mixed mode)](https://github.com/danielegrattarola/spektral/blob/master/examples/other/graph_signal_classification_mnist.py); +- [General GNN](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/general_gnn.py) +- [Custom dataset](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/custom_dataset.py) +- [OGB mol-esol regression with MinCut pooling](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/ogbg-mol-esol_batch.py) +- [OGB mol-hiv classification using edge attributes](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/ogbg-mol-esol_batch.py) +- [Regression on QM9 with ECC](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/qm9_batch.py) +- [Regression on QM9 with ECC and custom training 
loop](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/qm9_disjoint.py) +- [TUDataset classification with GIN](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/tud_disjoint.py) ## Other applications -- [Node clustering on citation networks with minCUT pooling (unsupervised)](https://github.com/danielegrattarola/spektral/blob/master/examples/other/node_clustering_mincut.py); - -The following notebooks are available on Kaggle with more visualizations (maintained by [@kmader](https://github.com/kmader)): -- [MNIST Graph Deep Learning](https://www.kaggle.com/kmader/mnist-graph-deep-learning); -- [MNIST Graph Pooling](https://www.kaggle.com/kmader/mnist-graph-nn-with-pooling); +- [Graph signal classification on MNIST (mixed mode)](https://github.com/danielegrattarola/spektral/blob/master/examples/other/graph_signal_classification_mnist.py) +- [Node clustering on citation networks with minCUT pooling (unsupervised)](https://github.com/danielegrattarola/spektral/blob/master/examples/other/node_clustering_mincut.py) diff --git a/docs/templates/external.md b/docs/templates/external.md new file mode 100644 index 00000000..18ed0c5d --- /dev/null +++ b/docs/templates/external.md @@ -0,0 +1,18 @@ +# External resources + +This is a collection of additional material about Spektral. + +## Paper + +We have presented the library at the ICML 2020 workshop "Graph Representation Learning and Beyond". + +Paper: + +> [Graph Neural Networks in TensorFlow and Keras with Spektral](https://grlplus.github.io/papers/9.pdf)
+> Daniele Grattarola and Cesare Alippi + +## Notebooks +The following notebooks are available on Kaggle with more visualizations (maintained by [@kmader](https://github.com/kmader)): + +- [MNIST Graph Deep Learning](https://www.kaggle.com/kmader/mnist-graph-deep-learning); +- [MNIST Graph Pooling](https://www.kaggle.com/kmader/mnist-graph-nn-with-pooling); diff --git a/examples/graph_prediction/custom_dataset.py b/examples/graph_prediction/custom_dataset.py index dae18a7c..3cea7ad3 100644 --- a/examples/graph_prediction/custom_dataset.py +++ b/examples/graph_prediction/custom_dataset.py @@ -16,8 +16,8 @@ the corresponding target will be [1, 0]. """ -import networkx as nx import numpy as np +import scipy.sparse as sp import tensorflow as tf from tensorflow.keras.layers import Input, Dense from tensorflow.keras.losses import CategoricalCrossentropy @@ -33,10 +33,10 @@ ################################################################################ # PARAMETERS ################################################################################ -learning_rate = 1e-3 # Learning rate -epochs = 500 # Number of training epochs +learning_rate = 1e-2 # Learning rate +epochs = 400 # Number of training epochs es_patience = 10 # Patience for early stopping -batch_size = 16 # Batch size +batch_size = 32 # Batch size ################################################################################ @@ -50,7 +50,7 @@ class MyDataset(Dataset): The graphs have `n_colors` colors, of at least `n_min` and at most `n_max` nodes connected with probability `p`. """ - def __init__(self, n_samples, n_colors=3, n_min=10, n_max=100, p=0.5, **kwargs): + def __init__(self, n_samples, n_colors=3, n_min=10, n_max=100, p=0.1, **kwargs): self.n_samples = n_samples self.n_colors = n_colors self.n_min = n_min @@ -68,7 +68,9 @@ def make_graph(): x[np.arange(n), colors] = 1 # Edges - a = nx.adj_matrix(nx.generators.gnp_random_graph(n, self.p)) + a = np.random.rand(n, n) <= self.p + a = np.maximum(a, a.T).astype(int) + a = sp.csr_matrix(a) # Labels y = np.zeros((self.n_colors, )) @@ -149,10 +151,7 @@ def evaluate(loader, ops_list): print('Fitting model') -current_batch = 0 -epoch = 0 -model_loss = 0 -model_acc = 0 +current_batch = epoch = model_loss = model_acc = 0 best_val_loss = np.inf best_weights = None patience = es_patience diff --git a/examples/graph_prediction/general_gnn.py b/examples/graph_prediction/general_gnn.py index 6f1ba8cb..593ae91d 100644 --- a/examples/graph_prediction/general_gnn.py +++ b/examples/graph_prediction/general_gnn.py @@ -1,5 +1,5 @@ """ -This example implements an experiment from the paper +This example implements the model from the paper > [Design Space for Graph Neural Networks](https://arxiv.org/abs/2011.08843)
> Jiaxuan You, Rex Ying, Jure Leskovec diff --git a/spektral/layers/convolutional/agnn_conv.py b/spektral/layers/convolutional/agnn_conv.py index 9451259b..5cb63129 100644 --- a/spektral/layers/convolutional/agnn_conv.py +++ b/spektral/layers/convolutional/agnn_conv.py @@ -47,8 +47,8 @@ class AGNNConv(MessagePassing): - `activation`: activation function to use; """ - def __init__(self, trainable=True, activation=None, **kwargs): - super().__init__(aggregate='sum', activation=activation, **kwargs) + def __init__(self, trainable=True, aggregate='sum', activation=None, **kwargs): + super().__init__(aggregate=aggregate, activation=activation, **kwargs) self.trainable = trainable def build(self, input_shape): @@ -77,11 +77,8 @@ def message(self, x, x_norm=None): return alpha * x_j - def get_config(self): - config = { + @property + def config(self): + return { 'trainable': self.trainable, } - base_config = super().get_config() - base_config.pop('aggregate') # Remove it because it's defined by constructor - - return {**base_config, **config} diff --git a/spektral/layers/convolutional/appnp_conv.py b/spektral/layers/convolutional/appnp_conv.py index dc4e2026..334044b1 100644 --- a/spektral/layers/convolutional/appnp_conv.py +++ b/spektral/layers/convolutional/appnp_conv.py @@ -3,10 +3,11 @@ from tensorflow.keras.models import Sequential from spektral.layers import ops -from spektral.layers.convolutional.gcn_conv import GCNConv +from spektral.layers.convolutional.conv import Conv +from spektral.utils import gcn_filter -class APPNPConv(GCNConv): +class APPNPConv(Conv): r""" The APPNP operator from the paper @@ -72,8 +73,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(channels, - activation=activation, + super().__init__(activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, @@ -83,6 +83,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) + self.channels = channels self.mlp_hidden = mlp_hidden if mlp_hidden else [] self.alpha = alpha self.propagations = propagations @@ -101,41 +102,34 @@ def build(self, input_shape): ) mlp_layers = [] for i, channels in enumerate(self.mlp_hidden): - mlp_layers.extend([ - Dropout(self.dropout_rate), - Dense(channels, self.mlp_activation, **layer_kwargs) - ]) - mlp_layers.append( - Dense(self.channels, 'linear', **layer_kwargs) - ) + mlp_layers.extend([Dropout(self.dropout_rate), + Dense(channels, self.mlp_activation, **layer_kwargs)]) + mlp_layers.append(Dense(self.channels, 'linear', **layer_kwargs)) self.mlp = Sequential(mlp_layers) self.built = True def call(self, inputs): - features = inputs[0] - fltr = inputs[1] - - # Compute MLP hidden features - mlp_out = self.mlp(features) + x, a = inputs - # Propagation + mlp_out = self.mlp(x) z = mlp_out for k in range(self.propagations): - z = (1 - self.alpha) * ops.filter_dot(fltr, z) + self.alpha * mlp_out + z = (1 - self.alpha) * ops.filter_dot(a, z) + self.alpha * mlp_out + output = self.activation(z) - if self.activation is not None: - output = self.activation(z) - else: - output = z return output - def get_config(self): - config = { + @property + def config(self): + return { + 'channels': self.channels, 'alpha': self.alpha, 'propagations': self.propagations, 'mlp_hidden': self.mlp_hidden, 'mlp_activation': activations.serialize(self.mlp_activation), 'dropout_rate': self.dropout_rate, } - base_config = super().get_config() - return 
dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def preprocess(a): + return gcn_filter(a) diff --git a/spektral/layers/convolutional/arma_conv.py b/spektral/layers/convolutional/arma_conv.py index e7fc8cca..f6d40b6a 100644 --- a/spektral/layers/convolutional/arma_conv.py +++ b/spektral/layers/convolutional/arma_conv.py @@ -2,11 +2,11 @@ from tensorflow.keras.layers import Dropout from spektral.layers import ops -from spektral.layers.convolutional.gcn_conv import GCNConv -from spektral.utils import normalized_laplacian, rescale_laplacian +from spektral.layers.convolutional.conv import Conv +from spektral.utils import normalized_adjacency -class ARMAConv(GCNConv): +class ARMAConv(Conv): r""" An Auto-Regressive Moving Average convolutional layer (ARMA) from the paper @@ -81,8 +81,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(channels, - activation=activation, + super().__init__(activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, @@ -92,6 +91,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) + self.channels = channels self.iterations = iterations self.order = order self.share_weights = share_weights @@ -115,24 +115,20 @@ def build(self, input_shape): ) current_shape = self.channels if self.share_weights and i == 1: - # No need to continue because all following weights will be shared + # No need to continue because all weights will be shared break self.kernels.append(kernel_stack) self.built = True def call(self, inputs): - x = inputs[0] - a = inputs[1] + x, a = inputs - # Convolution - output = [] # Stores the parallel filters + output = [] for k in range(self.order): output_k = x for i in range(self.iterations): output_k = self.gcs([output_k, x, a], k, i) output.append(output_k) - - # Average stacks output = K.stack(output, axis=-1) output = K.mean(output, axis=-1) output = self.activation(output) @@ -182,43 +178,36 @@ def gcs(self, inputs, stack, iteration): :param iteration: int, current iteration (used to retrieve kernels); :return: output node features. 
""" - X = inputs[0] - X_skip = inputs[1] - fltr = inputs[2] + x, x_skip, a = inputs - if self.share_weights and iteration >= 1: - iter = 1 - else: - iter = iteration + iter = 1 if self.share_weights and iteration >= 1 else iteration kernel_1, kernel_2, bias = self.kernels[stack][iter] - # Convolution - output = K.dot(X, kernel_1) - output = ops.filter_dot(fltr, output) + output = K.dot(x, kernel_1) + output = ops.filter_dot(a, output) - # Skip connection - skip = K.dot(X_skip, kernel_2) + skip = K.dot(x_skip, kernel_2) skip = Dropout(self.dropout_rate)(skip) output += skip if self.use_bias: output = K.bias_add(output, bias) output = self.gcn_activation(output) + return output - def get_config(self): - config = { + @property + def config(self): + return { + 'channels': self.channels, 'iterations': self.iterations, 'order': self.order, 'share_weights': self.share_weights, 'gcn_activation': activations.serialize(self.gcn_activation), 'dropout_rate': self.dropout_rate, } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) @staticmethod def preprocess(a): - a = normalized_laplacian(a, symmetric=True) - a = rescale_laplacian(a, lmax=2) - return a + return normalized_adjacency(a, symmetric=True) + diff --git a/spektral/layers/convolutional/cheb_conv.py b/spektral/layers/convolutional/cheb_conv.py index 38a3fd68..b66dfb34 100644 --- a/spektral/layers/convolutional/cheb_conv.py +++ b/spektral/layers/convolutional/cheb_conv.py @@ -1,11 +1,11 @@ from tensorflow.keras import backend as K from spektral.layers import ops -from spektral.layers.convolutional.gcn_conv import GCNConv +from spektral.layers.convolutional.conv import Conv from spektral.utils import normalized_laplacian, rescale_laplacian -class ChebConv(GCNConv): +class ChebConv(Conv): r""" A Chebyshev convolutional layer from the paper @@ -72,8 +72,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(channels, - activation=activation, + super().__init__(activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, @@ -83,6 +82,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) + self.channels = channels self.K = K def build(self, input_shape): @@ -104,10 +104,8 @@ def build(self, input_shape): self.built = True def call(self, inputs): - x = inputs[0] - a = inputs[1] + x, a = inputs - # Convolution T_0 = x output = ops.dot(T_0, self.kernel[0]) @@ -122,16 +120,16 @@ def call(self, inputs): if self.use_bias: output = K.bias_add(output, self.bias) - if self.activation is not None: - output = self.activation(output) + output = self.activation(output) + return output - def get_config(self): - config = { + @property + def config(self): + return { + 'channels': self.channels, 'K': self.K } - base_config = super(ChebConv, self).get_config() - return dict(list(base_config.items()) + list(config.items())) @staticmethod def preprocess(a): diff --git a/spektral/layers/convolutional/conv.py b/spektral/layers/convolutional/conv.py new file mode 100644 index 00000000..ef305290 --- /dev/null +++ b/spektral/layers/convolutional/conv.py @@ -0,0 +1,55 @@ +from tensorflow.keras.layers import Layer + +from spektral.utils.keras import is_layer_kwarg, deserialize_kwarg, is_keras_kwarg, serialize_kwarg + + +class Conv(Layer): + r""" + A general class for convolutional layers. 
+ + You can extend this class to create custom implementations of GNN layers + that use standard matrix multiplication instead of the gather-scatter + approach of MessagePassing. + + This is useful if you want to create layers that support dense inputs, + batch and mixed modes, or other non-standard processing. No checks are done + on the inputs, to allow for maximum flexibility. + + Any extension of this class must implement the `call(self, inputs)` and + `config(self)` methods. + + **Arguments**: + + - ``**kwargs`: additional keyword arguments specific to Keras' Layers, like + regularizers, initializers, constraints, etc. + """ + def __init__(self, **kwargs): + super().__init__(**{k: v for k, v in kwargs.items() if is_keras_kwarg(k)}) + self.kwargs_keys = [] + for key in kwargs: + if is_layer_kwarg(key): + attr = kwargs[key] + attr = deserialize_kwarg(key, attr) + self.kwargs_keys.append(key) + setattr(self, key, attr) + + def build(self, input_shape): + self.built = True + + def call(self, inputs): + raise NotImplementedError + + def get_config(self): + base_config = super().get_config() + keras_config = {} + for key in self.kwargs_keys: + keras_config[key] = serialize_kwarg(key, getattr(self, key)) + return {**base_config, **keras_config, **self.config} + + @property + def config(self): + return {} + + @staticmethod + def preprocess(a): + return a diff --git a/spektral/layers/convolutional/crystal_conv.py b/spektral/layers/convolutional/crystal_conv.py index aeb7f1b8..a3cb3fb8 100644 --- a/spektral/layers/convolutional/crystal_conv.py +++ b/spektral/layers/convolutional/crystal_conv.py @@ -53,6 +53,7 @@ class CrystalConv(MessagePassing): def __init__(self, channels, + aggregate='sum', activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -63,7 +64,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(aggregate='sum', + super().__init__(aggregate=aggregate, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, @@ -74,7 +75,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.channels = self.output_dim = channels + self.channels = channels def build(self, input_shape): assert len(input_shape) >= 2 @@ -102,10 +103,8 @@ def message(self, x, e=None): def update(self, embeddings, x=None): return x + embeddings - def get_config(self): - config = { + @property + def config(self): + return { 'channels': self.channels } - base_config = super().get_config() - base_config.pop('aggregate') # Remove it because it's defined by constructor - return {**base_config, **config} diff --git a/spektral/layers/convolutional/diffusion_conv.py b/spektral/layers/convolutional/diffusion_conv.py index e8f2aba0..c739daa5 100644 --- a/spektral/layers/convolutional/diffusion_conv.py +++ b/spektral/layers/convolutional/diffusion_conv.py @@ -1,6 +1,8 @@ import tensorflow as tf import tensorflow.keras.layers as layers -from spektral.layers.convolutional.gcn_conv import GCNConv + +from spektral.layers.convolutional.conv import Conv +from spektral.utils import gcn_filter class DiffuseFeatures(layers.Layer): @@ -70,7 +72,7 @@ def call(self, inputs): return tf.expand_dims(H, -1) -class DiffusionConv(GCNConv): +class DiffusionConv(Conv): r""" A diffusion convolution operator from the paper @@ -103,7 +105,7 @@ class DiffusionConv(GCNConv): **Arguments** - `channels`: number of output channels; - - `num_diffusion_steps`: How many diffusion steps to consider. \(K\) in paper. 
+ - `K`: number of diffusion steps. - `activation`: activation function \(\sigma\); (\(\tanh\) by default) - `kernel_initializer`: initializer for the weights; - `kernel_regularizer`: regularization applied to the weights; @@ -112,25 +114,20 @@ class DiffusionConv(GCNConv): def __init__(self, channels, - num_diffusion_steps=6, + K=6, activation='tanh', kernel_initializer='glorot_uniform', kernel_regularizer=None, kernel_constraint=None, **kwargs): - super().__init__(channels, - activation=activation, + super().__init__(activation=activation, kernel_initializer=kernel_initializer, kernel_regularizer=kernel_regularizer, kernel_constraint=kernel_constraint, **kwargs) - # number of features to generate (Q in paper) - assert channels > 0 - self.Q = channels - - # number of diffusion steps for each output feature - self.K = num_diffusion_steps + 1 + self.channels = channels + self.K = K + 1 def build(self, input_shape): self.filters = [ @@ -138,16 +135,16 @@ def build(self, input_shape): kernel_initializer=self.kernel_initializer, kernel_regularizer=self.kernel_regularizer, kernel_constraint=self.kernel_constraint) - for _ in range(self.Q)] + for _ in range(self.channels)] def apply_filters(self, x, a): - # This will be a list of Q diffused features. + # This will be a list of channels diffused features. # Each diffused feature is a (batch, n_nodes, 1) tensor. # Later we will concat all the features to get one - # (batch, n_nodes, Q) diffused graph signal + # (batch, n_nodes, channels) diffused graph signal diffused_features = [] - # Iterating over all Q diffusion filters + # Iterating over all channels diffusion filters for diffusion in self.filters: diffused_feature = diffusion((x, a)) diffused_features.append(diffused_feature) @@ -160,3 +157,14 @@ def call(self, inputs): h = self.activation(h) return h + + @property + def config(self): + return { + 'channels': self.channels, + 'K': self.K - 1 + } + + @staticmethod + def preprocess(a): + return gcn_filter(a) diff --git a/spektral/layers/convolutional/ecc_conv.py b/spektral/layers/convolutional/ecc_conv.py index 06728d84..7a196e8f 100644 --- a/spektral/layers/convolutional/ecc_conv.py +++ b/spektral/layers/convolutional/ecc_conv.py @@ -3,11 +3,11 @@ from tensorflow.keras.layers import Dense from spektral.layers import ops +from spektral.layers.convolutional.conv import Conv from spektral.layers.ops import modes -from spektral.layers.convolutional.gcn_conv import GCNConv -class ECCConv(GCNConv): +class ECCConv(Conv): r""" An edge-conditioned convolutional layer (ECC) from the paper @@ -75,8 +75,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(channels, - activation=activation, + super().__init__(activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, @@ -86,6 +85,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) + self.channels = channels self.kernel_network = kernel_network self.root = root @@ -199,14 +199,10 @@ def _call_single(self, inputs): return output - def get_config(self): - config = { + @property + def config(self): + return { + 'channels': self.channels, 'kernel_network': self.kernel_network, 'root': self.root, } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @staticmethod - def preprocess(a): - return a diff --git a/spektral/layers/convolutional/edge_conv.py b/spektral/layers/convolutional/edge_conv.py index 
34608ced..6e8ec58c 100644 --- a/spektral/layers/convolutional/edge_conv.py +++ b/spektral/layers/convolutional/edge_conv.py @@ -54,6 +54,7 @@ def __init__(self, channels, mlp_hidden=None, mlp_activation='relu', + aggregate='sum', activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -64,7 +65,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(aggregate='sum', + super().__init__(aggregate=aggregate, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, @@ -75,7 +76,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.channels = self.output_dim = channels + self.channels = channels self.mlp_hidden = mlp_hidden if mlp_hidden else [] self.mlp_activation = activations.get(mlp_activation) @@ -102,12 +103,10 @@ def message(self, x, **kwargs): x_j = self.get_j(x) return self.mlp(K.concatenate((x_i, x_j - x_i))) - def get_config(self): - config = { + @property + def config(self): + return { 'channels': self.channels, 'mlp_hidden': self.mlp_hidden, 'mlp_activation': self.mlp_activation } - base_config = super().get_config() - base_config.pop('aggregate') # Remove it because it's defined by constructor - return {**base_config, **config} diff --git a/spektral/layers/convolutional/gat_conv.py b/spektral/layers/convolutional/gat_conv.py index ec9df941..25201165 100644 --- a/spektral/layers/convolutional/gat_conv.py +++ b/spektral/layers/convolutional/gat_conv.py @@ -4,11 +4,11 @@ from tensorflow.keras.layers import Dropout from spektral.layers import ops -from spektral.layers.convolutional.gcn_conv import GCNConv +from spektral.layers.convolutional.conv import Conv from spektral.layers.ops import modes -class GATConv(GCNConv): +class GATConv(Conv): r""" A Graph Attention layer (GAT) from the paper @@ -93,8 +93,7 @@ def __init__(self, bias_constraint=None, attn_kernel_constraint=None, **kwargs): - super().__init__(channels, - activation=activation, + super().__init__(activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, @@ -104,6 +103,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) + self.channels = channels self.attn_heads = attn_heads self.concat_heads = concat_heads self.dropout_rate = dropout_rate @@ -113,10 +113,8 @@ def __init__(self, self.attn_kernel_constraint = constraints.get(attn_kernel_constraint) if concat_heads: - # Output will have shape (..., attention_heads * channels) self.output_dim = self.channels * self.attn_heads else: - # Output will have shape (..., channels) self.output_dim = self.channels def build(self, input_shape): @@ -237,12 +235,10 @@ def _call_dense(self, x, a): return output, attn_coef - def compute_output_shape(self, input_shape): - output_shape = input_shape[0][:-1] + (self.output_dim,) - return output_shape - - def get_config(self): - config = { + @property + def config(self): + return { + 'channels': self.channels, 'attn_heads': self.attn_heads, 'concat_heads': self.concat_heads, 'dropout_rate': self.dropout_rate, @@ -251,9 +247,3 @@ def get_config(self): 'attn_kernel_regularizer': regularizers.serialize(self.attn_kernel_regularizer), 'attn_kernel_constraint': constraints.serialize(self.attn_kernel_constraint), } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @staticmethod - def preprocess(a): - return a diff --git 
a/spektral/layers/convolutional/gated_graph_conv.py b/spektral/layers/convolutional/gated_graph_conv.py index 3bac8070..a8bc038c 100644 --- a/spektral/layers/convolutional/gated_graph_conv.py +++ b/spektral/layers/convolutional/gated_graph_conv.py @@ -75,7 +75,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.channels = self.output_dim = channels + self.channels = channels self.n_layers = n_layers def build(self, input_shape): @@ -115,10 +115,9 @@ def call(self, inputs): output = self.activation(output) return output - def get_config(self): - config = { + @property + def config(self): + return { 'channels': self.channels, 'n_layers': self.n_layers, } - base_config = super().get_config() - return {**base_config, **config} diff --git a/spektral/layers/convolutional/gcn_conv.py b/spektral/layers/convolutional/gcn_conv.py index a721acab..b0a393df 100644 --- a/spektral/layers/convolutional/gcn_conv.py +++ b/spektral/layers/convolutional/gcn_conv.py @@ -1,12 +1,11 @@ -from tensorflow.keras import activations, initializers, regularizers, constraints from tensorflow.keras import backend as K -from tensorflow.keras.layers import Layer from spektral.layers import ops +from spektral.layers.convolutional.conv import Conv from spektral.utils import gcn_filter -class GCNConv(Layer): +class GCNConv(Conv): r""" A graph convolutional layer (GCN) from the paper @@ -59,18 +58,17 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - - super().__init__(activity_regularizer=activity_regularizer, **kwargs) + super().__init__(activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + **kwargs) self.channels = channels - self.activation = activations.get(activation) - self.use_bias = use_bias - self.kernel_initializer = initializers.get(kernel_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) - self.bias_constraint = constraints.get(bias_constraint) - self.supports_masking = False def build(self, input_shape): assert len(input_shape) >= 2 @@ -92,34 +90,21 @@ def build(self, input_shape): def call(self, inputs): x, a = inputs + output = ops.dot(x, self.kernel) output = ops.filter_dot(a, output) if self.use_bias: output = K.bias_add(output, self.bias) - if self.activation is not None: - output = self.activation(output) + output = self.activation(output) + return output - def compute_output_shape(self, input_shape): - features_shape = input_shape[0] - output_shape = features_shape[:-1] + (self.channels,) - return output_shape - - def get_config(self): - config = { - 'channels': self.channels, - 'activation': activations.serialize(self.activation), - 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'bias_constraint': 
constraints.serialize(self.bias_constraint) + @property + def config(self): + return { + 'channels': self.channels } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) @staticmethod def preprocess(a): diff --git a/spektral/layers/convolutional/gcs_conv.py b/spektral/layers/convolutional/gcs_conv.py index 3123cdf8..cf301ed9 100644 --- a/spektral/layers/convolutional/gcs_conv.py +++ b/spektral/layers/convolutional/gcs_conv.py @@ -1,11 +1,11 @@ from tensorflow.keras import backend as K from spektral.layers import ops -from spektral.layers.convolutional.gcn_conv import GCNConv +from spektral.layers.convolutional.conv import Conv from spektral.utils import normalized_adjacency -class GCSConv(GCNConv): +class GCSConv(Conv): r""" A simple convolutional layer with a skip connection. @@ -55,8 +55,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(channels, - activation=activation, + super().__init__(activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, @@ -66,6 +65,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) + self.channels = channels def build(self, input_shape): assert len(input_shape) >= 2 @@ -105,6 +105,12 @@ def call(self, inputs): output = self.activation(output) return output + @property + def config(self): + return { + 'channels': self.channels + } + @staticmethod def preprocess(a): return normalized_adjacency(a) diff --git a/spektral/layers/convolutional/general_conv.py b/spektral/layers/convolutional/general_conv.py index 7cdca49c..39cf8b71 100644 --- a/spektral/layers/convolutional/general_conv.py +++ b/spektral/layers/convolutional/general_conv.py @@ -98,10 +98,10 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.channels = self.output_dim = channels + self.channels = channels self.dropout_rate = dropout self.use_batch_norm = batch_norm - if activation == 'prelu': + if activation == 'prelu' or 'prelu' in kwargs: self.activation = PReLU() else: self.activation = activations.get(activation) @@ -139,13 +139,12 @@ def call(self, inputs, **kwargs): return self.propagate(x, a) - def get_config(self): + @property + def config(self): config = { 'channels': self.channels, } - base_config = super().get_config() - if isinstance(self.activation, PReLU): - base_config['activation'] = 'prelu' - else: - base_config['activation'] = activations.serialize(self.activation) - return {**base_config, **config} \ No newline at end of file + if self.activation.__class__.__name__ == 'PReLU': + config['prelu'] = True + + return config diff --git a/spektral/layers/convolutional/gin_conv.py b/spektral/layers/convolutional/gin_conv.py index bae79624..353e62d6 100644 --- a/spektral/layers/convolutional/gin_conv.py +++ b/spektral/layers/convolutional/gin_conv.py @@ -59,6 +59,7 @@ def __init__(self, epsilon=None, mlp_hidden=None, mlp_activation='relu', + aggregate='sum', activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -69,7 +70,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(aggregate='sum', + super().__init__(aggregate=aggregate, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, @@ -80,7 +81,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.channels = self.output_dim = channels 
+ self.channels = channels self.epsilon = epsilon self.mlp_hidden = mlp_hidden if mlp_hidden else [] self.mlp_activation = activations.get(mlp_activation) @@ -117,13 +118,11 @@ def call(self, inputs, **kwargs): return output - def get_config(self): - config = { + @property + def config(self): + return{ 'channels': self.channels, 'epsilon': self.epsilon, 'mlp_hidden': self.mlp_hidden, 'mlp_activation': self.mlp_activation } - base_config = super().get_config() - base_config.pop('aggregate') # Remove it because it's defined by constructor - return {**base_config, **config} diff --git a/spektral/layers/convolutional/graphsage_conv.py b/spektral/layers/convolutional/graphsage_conv.py index 0e4e1541..93219cbf 100644 --- a/spektral/layers/convolutional/graphsage_conv.py +++ b/spektral/layers/convolutional/graphsage_conv.py @@ -75,7 +75,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.channels = self.output_dim = channels + self.channels = channels def build(self, input_shape): assert len(input_shape) >= 2 @@ -111,13 +111,8 @@ def call(self, inputs): return output - def get_config(self): - config = { + @property + def config(self): + return{ 'channels': self.channels } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @staticmethod - def preprocess(a): - return a diff --git a/spektral/layers/convolutional/message_passing.py b/spektral/layers/convolutional/message_passing.py index 6e2b3069..1df98215 100644 --- a/spektral/layers/convolutional/message_passing.py +++ b/spektral/layers/convolutional/message_passing.py @@ -5,7 +5,7 @@ from tensorflow.keras.layers import Layer from spektral.utils.keras import is_layer_kwarg, is_keras_kwarg, deserialize_kwarg, serialize_kwarg -from spektral.layers.ops.scatter import deserialize_scatter +from spektral.layers.ops.scatter import deserialize_scatter, serialize_scatter class MessagePassing(Layer): @@ -33,27 +33,39 @@ class MessagePassing(Layer): By extending this class, it is possible to create any message-passing layer in single/disjoint mode. - **API:** - - - `propagate(X, A, E=None, **kwargs)`: propagate the messages and compute - embeddings for each node in the graph. `kwargs` will be propagated as - keyword arguments to `message()`, `aggregate()` and `update()`. - - `message(X, **kwargs)`: computes messages, equivalent to \(\phi\) in the - definition. - Any extra keyword argument of this function will be populated by - `propagate()` if a matching keyword is found. + **API** + + ```python + propagate(x, a, e=None, **kwargs) + ``` + Propagates the messages and computes embeddings for each node in the graph.
+ Any `kwargs` will be forwarded as keyword arguments to `message()`, + `aggregate()` and `update()`. + + ```python + message(x, **kwargs) + ``` + Computes messages, equivalent to \(\phi\) in the definition.
+ Any extra keyword argument of this function will be populated by + `propagate()` if a matching keyword is found.
Use `self.get_i()` and `self.get_j()` to gather the elements using the - indices `i` or `j` of the adjacency matrix (e.g, `self.get_j(X)` will get - the features of the neighbours). - - `aggregate(messages, **kwargs)`: aggregates the messages, equivalent to - \(\square\) in the definition. + indices `i` or `j` of the adjacency matrix. + + ```python + aggregate(messages, **kwargs) + ``` + Aggregates the messages, equivalent to \(\square\) in the definition.
The behaviour of this function can also be controlled using the `aggregate` keyword in the constructor of the layer (supported aggregations: sum, mean, - max, min, prod). + max, min, prod).
Any extra keyword argument of this function will be populated by `propagate()` if a matching keyword is found. - - `update(embeddings, **kwargs)`: updates the aggregated messages to obtain - the final node embeddings, equivalent to \(\gamma\) in the definition. + + ```python + update(embeddings, **kwargs) + ``` + Updates the aggregated messages to obtain the final node embeddings, + equivalent to \(\gamma\) in the definition.
Any extra keyword argument of this function will be populated by `propagate()` if a matching keyword is found. @@ -64,10 +76,11 @@ class MessagePassing(Layer): Supported aggregations: 'sum', 'mean', 'max', 'min', 'prod'. If callable, the function must have the signature `foo(updates, indices, n_nodes)` and return a rank 2 tensor with shape `(n_nodes, ...)`. + - `kwargs`: additional keyword arguments specific to Keras' Layers, like + regularizers, initializers, constraints, etc. """ def __init__(self, aggregate='sum', **kwargs): super().__init__(**{k: v for k, v in kwargs.items() if is_keras_kwarg(k)}) - self.output_dim = None self.kwargs_keys = [] for key in kwargs: if is_layer_kwarg(key): @@ -158,21 +171,20 @@ def get_inputs(inputs): return x, a, e - def compute_output_shape(self, input_shape): - if self.output_dim: - output_shape = input_shape[0][:-1] + (self.output_dim, ) - else: - output_shape = input_shape[0] - return output_shape - def get_config(self): - config = { - 'aggregate': self.agg, + mp_config = { + 'aggregate': serialize_scatter(self.agg) } + keras_config = {} for key in self.kwargs_keys: - config[key] = serialize_kwarg(key, getattr(self, key)) + keras_config[key] = serialize_kwarg(key, getattr(self, key)) base_config = super().get_config() - return {**base_config, **config} + + return {**base_config, **keras_config, **mp_config, **self.config} + + @property + def config(self): + return {} @staticmethod def preprocess(a): diff --git a/spektral/layers/convolutional/tag_conv.py b/spektral/layers/convolutional/tag_conv.py index 68809c5d..aea20a70 100644 --- a/spektral/layers/convolutional/tag_conv.py +++ b/spektral/layers/convolutional/tag_conv.py @@ -50,6 +50,7 @@ class TAGConv(MessagePassing): def __init__(self, channels, K=3, + aggregate='sum', activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -60,7 +61,7 @@ def __init__(self, kernel_constraint=None, bias_constraint=None, **kwargs): - super().__init__(aggregate='sum', + super().__init__(aggregate=aggregate, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, @@ -71,7 +72,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.channels = self.output_dim = channels + self.channels = channels self.K = K self.linear = Dense(channels, activation=activation, @@ -103,13 +104,11 @@ def message(self, x, edge_weight=None): x_j = self.get_j(x) return edge_weight[:, None] * x_j - def get_config(self): - config = { + @property + def config(self): + return{ 'channels': self.channels, } - base_config = super().get_config() - base_config.pop('aggregate') # Remove it because it's defined by constructor - return {**base_config, **config} @staticmethod def preprocess(a): diff --git a/spektral/layers/ops/scatter.py b/spektral/layers/ops/scatter.py index b1fe509e..07742287 100644 --- a/spektral/layers/ops/scatter.py +++ b/spektral/layers/ops/scatter.py @@ -90,6 +90,16 @@ def scatter_prod(updates, indices, N): } +def serialize_scatter(identifier): + if identifier in OP_DICT: + return identifier + elif hasattr(identifier, '__name__'): + for k, v in OP_DICT.items(): + if v.__name__ == identifier.__name__: + return k + return None + + def deserialize_scatter(scatter): if isinstance(scatter, str): if scatter in OP_DICT: diff --git a/spektral/utils/keras.py b/spektral/utils/keras.py index 1939bbf2..7f09ac0b 100644 --- a/spektral/utils/keras.py +++ b/spektral/utils/keras.py @@ -6,10 +6,9 @@ def is_layer_kwarg(key): - return 
(key.endswith('_initializer') - or key.endswith('_regularizer') - or key.endswith('_constraint') - or key in LAYER_KWARGS) and not key in KERAS_KWARGS + return (key not in KERAS_KWARGS and (key.endswith('_initializer') + or key.endswith('_regularizer') or key.endswith('_constraint') + or key in LAYER_KWARGS)) def is_keras_kwarg(key): @@ -37,4 +36,4 @@ def serialize_kwarg(key, attr): if key == 'activation': return activations.serialize(attr) if key == 'use_bias': - return attr \ No newline at end of file + return attr diff --git a/tests/test_layers/test_convolutional.py b/tests/test_layers/test_convolutional.py index 92096c48..375a2548 100644 --- a/tests/test_layers/test_convolutional.py +++ b/tests/test_layers/test_convolutional.py @@ -220,7 +220,17 @@ def _test_get_config(layer, **kwargs): kwargs.pop('edges') layer_instance = layer(**kwargs) config = layer_instance.get_config() - assert layer(**config) + layer_instance_new = layer(**config) + config_new = layer_instance_new.get_config() + config.pop('name') + config_new.pop('name') + + # Remove 'name' if we have advanced activations (needed for GeneralConv) + if 'activation' in config and 'class_name' in config['activation']: + config['activation']['config'].pop('name') + config_new['activation']['config'].pop('name') + + assert config_new == config def test_layers(): From 3c9747e797767b073b83e72abf942131e84dc553 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 27 Nov 2020 13:43:49 +0100 Subject: [PATCH 51/57] Add Pool class --- spektral/layers/convolutional/general_conv.py | 7 +- .../layers/convolutional/message_passing.py | 2 +- spektral/layers/ops/__init__.py | 4 +- spektral/layers/pooling/diff_pool.py | 49 ++++--------- spektral/layers/pooling/global_pool.py | 9 --- spektral/layers/pooling/mincut_pool.py | 70 +++++++------------ spektral/layers/pooling/pool.py | 45 ++++++++++++ spektral/layers/pooling/sag_pool.py | 10 +-- spektral/layers/pooling/topk_pool.py | 43 +++++------- tests/test_layers/test_pooling.py | 9 +-- 10 files changed, 112 insertions(+), 136 deletions(-) create mode 100644 spektral/layers/pooling/pool.py diff --git a/spektral/layers/convolutional/general_conv.py b/spektral/layers/convolutional/general_conv.py index 39cf8b71..764daa84 100644 --- a/spektral/layers/convolutional/general_conv.py +++ b/spektral/layers/convolutional/general_conv.py @@ -1,8 +1,9 @@ -from spektral.layers.convolutional.message_passing import MessagePassing -from tensorflow.keras.layers import Dropout, BatchNormalization, PReLU -from spektral.layers.ops import dot import tensorflow as tf from tensorflow.keras import activations +from tensorflow.keras.layers import Dropout, BatchNormalization, PReLU + +from spektral.layers.convolutional.message_passing import MessagePassing +from spektral.layers.ops import dot class GeneralConv(MessagePassing): diff --git a/spektral/layers/convolutional/message_passing.py b/spektral/layers/convolutional/message_passing.py index 1df98215..f227ea7c 100644 --- a/spektral/layers/convolutional/message_passing.py +++ b/spektral/layers/convolutional/message_passing.py @@ -4,8 +4,8 @@ from tensorflow.keras import backend as K from tensorflow.keras.layers import Layer -from spektral.utils.keras import is_layer_kwarg, is_keras_kwarg, deserialize_kwarg, serialize_kwarg from spektral.layers.ops.scatter import deserialize_scatter, serialize_scatter +from spektral.utils.keras import is_layer_kwarg, is_keras_kwarg, deserialize_kwarg, serialize_kwarg class MessagePassing(Layer): diff --git 
a/spektral/layers/ops/__init__.py b/spektral/layers/ops/__init__.py index 733b6483..dc8fde24 100644 --- a/spektral/layers/ops/__init__.py +++ b/spektral/layers/ops/__init__.py @@ -1,7 +1,7 @@ -from .ops import * -from .matmul import * from .graph import * +from .matmul import * from .modes import * +from .ops import * from .scatter import * from .sparse import * diff --git a/spektral/layers/pooling/diff_pool.py b/spektral/layers/pooling/diff_pool.py index d0a93a9a..572fe38e 100644 --- a/spektral/layers/pooling/diff_pool.py +++ b/spektral/layers/pooling/diff_pool.py @@ -1,13 +1,13 @@ import tensorflow as tf -from tensorflow.keras import activations, initializers, regularizers, constraints +from tensorflow.keras import activations from tensorflow.keras import backend as K -from tensorflow.keras.layers import Layer from spektral.layers import ops from spektral.layers.ops import modes +from spektral.layers.pooling.pool import Pool -class DiffPool(Layer): +class DiffPool(Pool): r""" A DiffPool layer from the paper @@ -18,7 +18,7 @@ class DiffPool(Layer): This layer computes a soft clustering \(\S\) of the input graphs using a GNN, and reduces graphs as follows: -$$ + $$ \S = \textrm{GNN}(\A, \X); \\ \A' = \S^\top \A \S; \X' = \S^\top \X; $$ @@ -72,14 +72,14 @@ def __init__(self, kernel_constraint=None, **kwargs): - super().__init__(**kwargs) + super().__init__(activation=activation, + kernel_initializer=kernel_initializer, + kernel_regularizer=kernel_regularizer, + kernel_constraint=kernel_constraint, + **kwargs) self.k = k self.channels = channels self.return_mask = return_mask - self.activation = activations.get(activation) - self.kernel_initializer = initializers.get(kernel_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) def build(self, input_shape): assert isinstance(input_shape, list) @@ -172,35 +172,10 @@ def call(self, inputs): return output - def compute_output_shape(self, input_shape): - X_shape = input_shape[0] - A_shape = input_shape[1] - X_shape_out = X_shape[:-2] + (self.k, self.channels) - if self.reduce_loss: - A_shape_out = X_shape[:-2] + (self.k, self.k) - else: - A_shape_out = A_shape[:-2] + (self.k, self.k) - - output_shape = [X_shape_out, A_shape_out] - - if len(input_shape) == 3: - I_shape_out = A_shape[:-2] + (self.k,) - output_shape.append(I_shape_out) - - if self.return_mask: - S_shape_out = A_shape[:-1] + (self.k,) - output_shape.append(S_shape_out) - - return output_shape - - def get_config(self): - config = { + @property + def config(self): + return { 'k': self.k, 'channels': self.channels, 'return_mask': self.return_mask, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) \ No newline at end of file diff --git a/spektral/layers/pooling/global_pool.py b/spektral/layers/pooling/global_pool.py index b415984e..e5742961 100644 --- a/spektral/layers/pooling/global_pool.py +++ b/spektral/layers/pooling/global_pool.py @@ -37,15 +37,6 @@ def call(self, inputs): else: return self.batch_pooling_op(X, axis=-2, keepdims=(self.data_mode == 'single')) - def compute_output_shape(self, input_shape): - if self.data_mode == 'single': - return (1,) + input_shape[-1:] - elif self.data_mode == 'batch': - return input_shape[:-2] + 
input_shape[-1:] - else: - # Input shape is a list of shapes for X and I - return input_shape[0] - def get_config(self): return super().get_config() diff --git a/spektral/layers/pooling/mincut_pool.py b/spektral/layers/pooling/mincut_pool.py index 3094c8be..efdf38de 100644 --- a/spektral/layers/pooling/mincut_pool.py +++ b/spektral/layers/pooling/mincut_pool.py @@ -1,12 +1,13 @@ import tensorflow as tf from tensorflow.keras import Sequential -from tensorflow.keras import activations, initializers, regularizers, constraints, backend as K -from tensorflow.keras.layers import Layer, Dense +from tensorflow.keras import backend as K +from tensorflow.keras.layers import Dense from spektral.layers import ops +from spektral.layers.pooling.pool import Pool -class MinCutPool(Layer): +class MinCutPool(Pool): r""" A MinCut pooling layer from the paper @@ -56,9 +57,13 @@ class MinCutPool(Layer): only the output layer); - `mlp_activation`: activation for the MLP layers; - `return_mask`: boolean, whether to return the cluster assignment matrix; - - `kernel_initializer`: initializer for the weights; - - `kernel_regularizer`: regularization applied to the weights; - - `kernel_constraint`: constraint applied to the weights; + - `use_bias`: use bias in the MLP; + - `kernel_initializer`: initializer for the weights of the MLP; + - `bias_initializer`: initializer for the bias of the MLP; + - `kernel_regularizer`: regularization applied to the weights of the MLP; + - `bias_regularizer`: regularization applied to the bias of the MLP; + - `kernel_constraint`: constraint applied to the weights of the MLP; + - `bias_constraint`: constraint applied to the bias of the MLP; """ def __init__(self, @@ -76,19 +81,19 @@ def __init__(self, bias_constraint=None, **kwargs): - super().__init__(**kwargs) + super().__init__(activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + **kwargs) self.k = k self.mlp_hidden = mlp_hidden if mlp_hidden else [] self.mlp_activation = mlp_activation self.return_mask = return_mask - self.activation = activations.get(activation) - self.use_bias = use_bias - self.kernel_initializer = initializers.get(kernel_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) - self.bias_constraint = constraints.get(bias_constraint) def build(self, input_shape): assert isinstance(input_shape, list) @@ -167,36 +172,11 @@ def call(self, inputs): return output - def compute_output_shape(self, input_shape): - X_shape = input_shape[0] - A_shape = input_shape[1] - X_shape_out = X_shape[:-2] + (self.k,) + X_shape[-1:] - A_shape_out = A_shape[:-2] + (self.k, self.k) - - output_shape = [X_shape_out, A_shape_out] - - if len(input_shape) == 3: - I_shape_out = A_shape[:-2] + (self.k, ) - output_shape.append(I_shape_out) - - if self.return_mask: - S_shape_out = A_shape[:-1] + (self.k, ) - output_shape.append(S_shape_out) - - return output_shape - - def get_config(self): - config = { + @property + def config(self): + return { 'k': self.k, 'mlp_hidden': self.mlp_hidden, 'mlp_activation': self.mlp_activation, - 'return_mask': self.return_mask, - 'kernel_initializer': 
initializers.serialize(self.kernel_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint) - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) \ No newline at end of file + 'return_mask': self.return_mask + } \ No newline at end of file diff --git a/spektral/layers/pooling/pool.py b/spektral/layers/pooling/pool.py new file mode 100644 index 00000000..faa86ade --- /dev/null +++ b/spektral/layers/pooling/pool.py @@ -0,0 +1,45 @@ +from tensorflow.keras.layers import Layer + +from spektral.utils.keras import is_layer_kwarg, deserialize_kwarg, is_keras_kwarg, serialize_kwarg + + +class Pool(Layer): + r""" + A general class for pooling layers. + + You can extend this class to create custom implementations of pooling layers. + + Any extension of this class must implement the `call(self, inputs)` and + `config(self)` methods. + + **Arguments**: + + - ``**kwargs`: additional keyword arguments specific to Keras' Layers, like + regularizers, initializers, constraints, etc. + """ + def __init__(self, **kwargs): + super().__init__(**{k: v for k, v in kwargs.items() if is_keras_kwarg(k)}) + self.kwargs_keys = [] + for key in kwargs: + if is_layer_kwarg(key): + attr = kwargs[key] + attr = deserialize_kwarg(key, attr) + self.kwargs_keys.append(key) + setattr(self, key, attr) + + def build(self, input_shape): + self.built = True + + def call(self, inputs): + raise NotImplementedError + + def get_config(self): + base_config = super().get_config() + keras_config = {} + for key in self.kwargs_keys: + keras_config[key] = serialize_kwarg(key, getattr(self, key)) + return {**base_config, **keras_config, **self.config} + + @property + def config(self): + return {} diff --git a/spektral/layers/pooling/sag_pool.py b/spektral/layers/pooling/sag_pool.py index 2bf68c1d..b223c8f6 100644 --- a/spektral/layers/pooling/sag_pool.py +++ b/spektral/layers/pooling/sag_pool.py @@ -13,11 +13,11 @@ class SAGPool(TopKPool): **Mode**: single, disjoint. This layer computes the following operations: -$$ - \y = \textrm{GNN}(\A, \X); \;\;\;\; - \i = \textrm{rank}(\y, K); \;\;\;\; - \X' = (\X \odot \textrm{tanh}(\y))_\i; \;\;\;\; - \A' = \A_{\i, \i} + $$ + \y = \textrm{GNN}(\A, \X); \;\;\;\; + \i = \textrm{rank}(\y, K); \;\;\;\; + \X' = (\X \odot \textrm{tanh}(\y))_\i; \;\;\;\; + \A' = \A_{\i, \i} $$ where \( \textrm{rank}(\y, K) \) returns the indices of the top K values of diff --git a/spektral/layers/pooling/topk_pool.py b/spektral/layers/pooling/topk_pool.py index e706fa4c..62b9e10d 100644 --- a/spektral/layers/pooling/topk_pool.py +++ b/spektral/layers/pooling/topk_pool.py @@ -1,11 +1,11 @@ import tensorflow as tf -from tensorflow.keras import backend as K, initializers, regularizers, constraints -from tensorflow.keras.layers import Layer +from tensorflow.keras import backend as K from spektral.layers import ops +from spektral.layers.pooling.pool import Pool -class TopKPool(Layer): +class TopKPool(Pool): r""" A gPool/Top-K layer from the papers @@ -20,11 +20,11 @@ class TopKPool(Layer): **Mode**: single, disjoint. 
This layer computes the following operations: -$$ - \y = \frac{\X\p}{\|\p\|}; \;\;\;\; - \i = \textrm{rank}(\y, K); \;\;\;\; - \X' = (\X \odot \textrm{tanh}(\y))_\i; \;\;\;\; - \A' = \A_{\i, \i} + $$ + \y = \frac{\X\p}{\|\p\|}; \;\;\;\; + \i = \textrm{rank}(\y, K); \;\;\;\; + \X' = (\X \odot \textrm{tanh}(\y))_\i; \;\;\;\; + \A' = \A_{\i, \i} $$ where \( \textrm{rank}(\y, K) \) returns the indices of the top K values of @@ -72,14 +72,14 @@ def __init__(self, kernel_regularizer=None, kernel_constraint=None, **kwargs): - super().__init__(**kwargs) + super().__init__(kernel_initializer=kernel_initializer, + kernel_regularizer=kernel_regularizer, + kernel_constraint=kernel_constraint, + **kwargs) self.ratio = ratio self.return_mask = return_mask self.sigmoid_gating = sigmoid_gating self.gating_op = K.sigmoid if self.sigmoid_gating else K.tanh - self.kernel_initializer = initializers.get(kernel_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) def build(self, input_shape): self.F = input_shape[0][-1] @@ -145,19 +145,10 @@ def call(self, inputs): def compute_scores(self, X, A, I): return K.dot(X, K.l2_normalize(self.kernel)) - def compute_output_shape(self, input_shape): - output_shape = input_shape - if self.return_mask: - output_shape += [(input_shape[0][:-1])] - return output_shape - - def get_config(self): - config = { + @property + def config(self): + return { 'ratio': self.ratio, 'return_mask': self.return_mask, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) \ No newline at end of file + 'sigmoid_gating': self.sigmoid_gating + } \ No newline at end of file diff --git a/tests/test_layers/test_pooling.py b/tests/test_layers/test_pooling.py index 5e250fb0..6a5c3d06 100644 --- a/tests/test_layers/test_pooling.py +++ b/tests/test_layers/test_pooling.py @@ -4,6 +4,7 @@ from tensorflow.keras import Input, Model from spektral.layers import TopKPool, MinCutPool, DiffPool, SAGPool +from .test_convolutional import _test_get_config tf.keras.backend.set_floatx('float64') @@ -148,14 +149,6 @@ def _test_disjoint_mode(layer, **kwargs): _check_output_and_model_output_shapes(output_shape, model.output_shape) -def _test_get_config(layer, **kwargs): - if kwargs.get('edges'): - kwargs.pop('edges') - layer_instance = layer(**kwargs) - config = layer_instance.get_config() - assert layer(**config) - - def test_layers(): for test in TESTS: for mode in test[MODES_K_]: From 3a911933580ec79cc508677d93fbe0bb450b0d7d Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 27 Nov 2020 13:50:08 +0100 Subject: [PATCH 52/57] Fix global_pool.py --- spektral/layers/pooling/global_pool.py | 9 +++++++++ spektral/transforms/delaunay.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/spektral/layers/pooling/global_pool.py b/spektral/layers/pooling/global_pool.py index e5742961..b415984e 100644 --- a/spektral/layers/pooling/global_pool.py +++ b/spektral/layers/pooling/global_pool.py @@ -37,6 +37,15 @@ def call(self, inputs): else: return self.batch_pooling_op(X, axis=-2, keepdims=(self.data_mode == 'single')) + def compute_output_shape(self, input_shape): + if self.data_mode == 'single': + return (1,) + input_shape[-1:] + elif self.data_mode == 
'batch': + return input_shape[:-2] + input_shape[-1:] + else: + # Input shape is a list of shapes for X and I + return input_shape[0] + def get_config(self): return super().get_config() diff --git a/spektral/transforms/delaunay.py b/spektral/transforms/delaunay.py index 2e82ddf4..0cdc023b 100644 --- a/spektral/transforms/delaunay.py +++ b/spektral/transforms/delaunay.py @@ -1,6 +1,6 @@ -from scipy.spatial import Delaunay as DelaunaySP import numpy as np import scipy.sparse as sp +from scipy.spatial import Delaunay as DelaunaySP class Delaunay: From 9eb90f0e34b605a52a91551594c6fd983b324362 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 27 Nov 2020 16:51:01 +0100 Subject: [PATCH 53/57] Fix bug in SingleLoader Re-write data modes tutorial --- docs/autogen.py | 9 +- docs/templates/data-modes.md | 246 ++++++++++++++++++++----------- docs/templates/layers/pooling.md | 6 + spektral/data/graph.py | 4 +- spektral/data/loaders.py | 6 +- 5 files changed, 182 insertions(+), 89 deletions(-) diff --git a/docs/autogen.py b/docs/autogen.py index 1247fd8e..3a07e837 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -64,7 +64,14 @@ layers.DiffPool, layers.MinCutPool, layers.SAGPool, - layers.TopKPool, + layers.TopKPool + ] + }, + { + 'page': 'layers/pooling.md', + 'functions': [], + 'methods': [], + 'classes': [ layers.GlobalAvgPool, layers.GlobalMaxPool, layers.GlobalSumPool, diff --git a/docs/templates/data-modes.md b/docs/templates/data-modes.md index fbd9a69b..654d5d7a 100644 --- a/docs/templates/data-modes.md +++ b/docs/templates/data-modes.md @@ -1,142 +1,220 @@ -# Representing graphs +# Data modes -In Spektral, graphs are represented as matrices: +Creating mini-batches of data can be tricky when the samples have different shapes. -- `A` is the adjacency matrix of shape `(N, N)`, where `N` is the number of nodes. `A` is a binary matrix where `A[i, j] = 1` if there is an edge between nodes `i` and `j`, and `0` otherwise. -- `X` is the node attributes matrix of shape `(N, F)`, where `F` is the size of the node attributes. +In traditional neural networks, we're used to stretching, cropping, or padding our data so that the inputs are standardized. +For instance, images of different sizes can be modified so that they fit into a tensor of shape `[batch, width, height, channels]`. +Sequences can be padded so that they have shape `[batch, time, channels]`. And so on... -Sometimes, we can also have edge attributes of size `S`, which we store in a matrix `E` of shape `(n_edges, S)` where each row is associated to a non-zero entry of `A`: assuming that `A` is a Scipy sparse matrix, we have that `E[i]` is the attribute associated to `A.data[i]`. +With graphs, it's a bit different. -## Modes +For instance, it is not that easy to define the meaning of "cropping" or "stretching" a graph, since these are all transformations that assume a "spatial closeness" of the pixels (which we don't have for graphs in general). -Spektral supports four different ways of representing graphs or batches of graphs, which we refer to as **data modes**. +Also, it's not always the case that we have many graphs in our datasets. Sometimes, we're just interested in classifying the nodes of one big graph. Sometimes, we may have one big graph but many instances of its node features (the classification of images is one such case: one grid, many instances of pixels values). 
-- In **single mode**, we have one graph with its adjacency matrix and attributes; -- **Disjoint mode** is a special case of single mode, where the graph is the disjoint union of a set of graphs; -- In **batch mode**, a set of graphs is represented by stacking their adjacency and node attributes matrices in higher order tensors of shape `(batch, N, ...)`; -- In **mixed mode**, we have a single adjacency matrix shared by a set of graphs; the adjacency matrix will be in single mode, but the node attributes will be in batch mode. +To make Spektral work in all of these cases, and to account for the difficulties in dealing with graphs of different sizes, we introduce the concept of **data modes**. -The difference between the four data modes can be easily seen in how `A`, `X`, and `E` have different shapes in each case: +In Spektral, there are four of them: -|Mode | `A.shape` | `X.shape` | `E.shape` | -|:------:|:-------------:|:-------------:|:----------------:| -|Single |`(N, N)` |`(N, F)` |`(n_edges, S)` | -|Disjoint|`(N, N)` |`(N, F)` |`(n_edges, S)` | -|Batch |`(batch, N, N)`|`(batch, N, F)`|`(batch, N, N, S)`| -|Mixed |`(N, N)` |`(batch, N, F)`| N/A | +- In **single mode**, we have only one graph. Node classification tasks are usually in this mode. +- In **disjoint mode**, we represent a batch of graphs with their disjoint union. This gives us one big graph, similar to single mode, although with some differences (see below). +- In **batch mode**, we zero-pad the graphs so that we can fit them into **dense** tensors of shape `[batch, nodes, ...]`. This can be more expensive, but makes it easier to interface with traditional NNs. +- In **mixed mode**, we have one adjacency matrix shared by many graphs. We keep the adjacency matrix in single mode (for performance, no need to duplicate it for each graph), and the node attributes in batch mode. +In all data modes, our goal is to represent one or more graphs by grouping their respective `x`, `a` and `e` matrices into single tensors `X`, `A`, and `E`. The shapes of these tensors in the different data modes are summarized in the table below. +|Mode | `A.shape` | `X.shape` | `E.shape` | +|:---------|:--|:--|:--| +|`Single` |`[nodes, nodes]`|`[nodes, n_feat]`|`[edges, e_feat]`| +|`Disjoint`|`[nodes, nodes]`|`[nodes, n_feat]`|`[edges, e_feat]`| +|`Batch` |`[batch, n_max, n_max]`|`[batch, n_max, n_feat]`|`[batch, n_max, n_max, e_feat]`| +|`Mixed` |`[nodes, nodes]`|`[batch, nodes, n_feat]`| `n/a` | + +In the following sections we describe the four modes more into detail. +In particular, we go over which [data `Loader`](/loaders/) to use in each case. ## Single mode - + + +In single mode we have only one graph in which: -In **single mode** the data describes a single graph where: +- `A` is a matrix of shape `[nodes, nodes]`; +- `X` is a matrix of shape `[nodes, n_feat]`; +- `E` has shape `[edges, e_feat]` so that `E[i]` corresponds to the edge in `A[i // nodes, i % nodes]`. -- `A` is a sparse matrix of shape `(N, N)`; -- `X` is a matrix of shape `(N, F)`; +A very common benchmark dataset in single mode is the Cora citation network. +We can load it with: -When edge attributes are present, we represent them as a matrix `E` of shape `(n_edges, S)` so that there is a correspondence between `E[i]` and `A.data[i]`. +```py +>>> from spektral.datasets import Cora +>>> dataset = Cora() +>>> dataset +Cora(n_graphs=1) +``` -Three very popular datasets in this setting are the citation networks: Cora, Citeseer, and Pubmed. 
To load a citation network, you can use the built-in loader: +As expected, we have only one graph: ```py ->>> from spektral.datasets import citation ->>> A, X, _, _, _, _ = citation.load_data('cora') -Loading cora dataset ->>> A.shape +>>> dataset[0] +Graph(n_nodes=2708, n_node_features=1433, n_edge_features=None, n_labels=7) + +>>> dataset[0].a.shape (2708, 2708) ->>> X.shape + +>>> dataset[0].x.shape (2708, 1433) ``` +When training a GNN in single mode, we can use a `SingleLoader` that will extract the characteristic matrices from the graph and return a `tf.data.Dataset` to feed to our model: + +```py +>>> from spektral.data import SingleLoader +>>> loader = SingleLoader(dataset) +>>> loader.load() + +``` + ## Disjoint mode - + + +In disjoint mode we represent a set of graphs as a single graph, their "disjoint union", where: -**Disjoint mode** is a smart way of representing a set of graphs as a single graph. -In particular, the disjoint union of a batch is a graph where +- `A` is a sparse block diagonal matrix where each block is the adjacency matrix `a_i` of the i-th graph. +- `X` is obtained by stacking the node attributes `x_i`; +- `E` is also obtained by stacking the edges `e_i`. -- `A` is a sparse block diagonal matrix, where each block is the adjacency matrix `A_i` of the i-th graph; -- `X` is obtained by stacking the node attributes matrices of the graphs. +The shapes of the three matrices are the same as single mode, but `nodes` is the number of all the nodes in the set of graphs. -When edge attributes are present, we represent them as a matrix `E` of shape `(n_edges, S)` so that there is a correspondence between `E[i]` and `A.data[i]`. +In order to keep track of the different graphs in the disjoint union, we use an additional array of zero-based indices `I` that identify which nodes belong to which graph. +For instance: if node 8 belongs to the third graph, we will have `I[8] == 2`.
+In the example above, color blue represents 0, green is 1, and orange is 2 -In order to keep track of different graphs in the disjoint union, we use an additional array of integers `I` that identifies which nodes belong to the same graph. -For convolutional layers, disjoint mode is indistinguishable from single mode because it is not possible to exchange messages between the components of the graph, so `I` is not needed to compute the output. -Pooling layers, on the other hand, require `I` to know which nodes can be pooled together. -Hierarchical pooling layers will return a reduced version of `I` along with the reduced graphs. Global pooling layers will consume `I` and reduce the graphs to single vectors. +In convolutional layers, disjoint mode is indistinguishable from single mode because it is not possible to exchange messages between the disjoint components of the graph, so `I` is not needed to compute the output. +Pooling layers, on the other hand, require `I` to know which nodes can be pooled together. -Utilities for creating the disjoint union of a list of graphs are provided in `spektral.utils.data`: + +Let's load a dataset with many small graphs and have a look at the first three: ```py ->>> from spektral.utils.data import to_disjoint ->>> A_list = [np.ones((2, 2)), np.ones((3, 3))] # One graph has 2 nodes, the other has 3 ->>> X_list = [np.random.randn(2, 4), np.random.randn(3, 4)] # F = 4 ->>> X, A, I = to_disjoint(X_list, A_list) ->>> X.shape -(5, 4) ->>> A.shape -(5, 5) ->>> A.toarray() -array([[1., 1., 0., 0., 0.], - [1., 1., 0., 0., 0.], - [0., 0., 1., 1., 1.], - [0., 0., 1., 1., 1.], - [0., 0., 1., 1., 1.]]) ->>> I -array([0, 0, 1, 1, 1]) +>>> from spektral.datasets import TUDataset +>>> dataset = TUDataset('PROTEINS') +Successfully loaded PROTEINS. + +>>> dataset = dataset[:3] +>>> dataset[0] +Graph(n_nodes=42, n_node_features=4, n_edge_features=None, n_labels=2) + +>>> dataset[1] +Graph(n_nodes=27, n_node_features=4, n_edge_features=None, n_labels=2) + +>>> dataset[2] +Graph(n_nodes=10, n_node_features=4, n_edge_features=None, n_labels=2) ``` +To create batches in disjoint mode, we can use a `DisjointLoader`: + +```py +>>> from spektral.data import DisjointLoader +>>> loader = DisjointLoader(dataset, batch_size=3) +``` + +Since Loaders are effectively generators, we can inspect the first batch that the loader will compute for us by calling `__next__()`: + +```py +>>> batch = loader.__next__() +>>> inputs, target = batch +>>> x, a, i = inputs +>>> x.shape +(79, 4) # 79 == 42 + 27 + 10 + +>>> a.shape +(79, 79) + +>>> i.shape +(79, ) +``` + +Note that, since we don't have edge attributes in our datset, the loader did not create the `E` matrix. + + + ## Batch mode - + -In **batch mode**, graphs have the same number of nodes and are stacked in tensors of shape `(batch, N, ...)`. -Due to the general lack of support for sparse higher-order tensors both in Scipy and TensorFlow, `A` and `X` will be dense tensors. +In batch mode, graphs are zero-padded so that they fit into tensors of shape `[batch, N, ...]`. +Due to the general lack of support for sparse higher-order tensors both in Scipy and TensorFlow, `X`, `A`, and `E` will be dense tensors: -In this case, edge attributes must also be reshaped and made dense, so that `E` has shape `(batch, N, N, S)` (the attributes of non-existing edges are usually all zeros). 
+- `A` has shape `[batch, nodes, nodes]`; +- `X` has shape `[batch, nodes, n_feat]`; +- `E` has shape `[batch, nodes, nodes, e_feat]` (the attributes of non-existing edges are all zeros). -Note that if the graphs have variable number of nodes, the matrices must be zero-padded so that they have the same `N`. -If you don't want to zero-pad the graphs or work with dense inputs, it is better to work in [disjoint mode](https://danielegrattarola.github.io/spektral/data/#disjoint-mode) instead. +If the graphs have a variable number of nodes, `nodes` will be the size of the biggest graph in the batch. -The advantage of batch mode is that it is more intuitive and it allows to use the training loop of `tf.keras` without any modifications. Also, some pooling layers like `DiffPool` and `MinCutPool` will only work in batch mode. +If you don't want to zero-pad the graphs or work with dense inputs, it is better to use [disjoint mode](#disjoint-mode) instead. -For example, the QM9 dataset of small molecules will be loaded in batch mode by default: +However, note that some pooling layers like `DiffPool` and `MinCutPool` will only work in batch mode. + +Let's re-use the dataset from the example above. We can use a `BatchLoader` as follows: ```py ->>> from spektral.datasets import qm9 ->>> A, X, E, y = qm9.load_data() -Loading QM9 dataset. -Reading SDF ->>> A.shape -(133885, 9, 9) ->>> X.shape -(133885, 9, 6) ->>> E.shape -(133885, 9, 9, 5) +>>> from spektral.data import BatchLoader +>>> loader = BatchLoader(dataset, batch_size=3) +>>> inputs, target = loader.__next__() + +>>> inputs[0].shape +(3, 42, 4) + +>>> inputs[1].shape +(3, 42, 42) ``` +In this case, the loader only created two inputs because we don't need the indices `I`. +Also note that the batch was padded so that all graphs have 42 nodes, which is the size of the biggest graph out of the three. + +The `BatchLoader` zero-pads each batch independently of the others, so that we don't waste memory. If you want to remove the overhead of padding each batch, you can use a `PackedBatchLoader` which will pre-pad all graphs before yielding the batches. Of course, this means that all graphs will have the same number of nodes as the biggest graph in the dataset (and not just in the batch). + + ## Mixed mode - + -In **mixed mode** we consider a single adjacency matrix that acts as the support for different node attributes (also sometimes called "signals"). +In mixed mode we have a single graph that acts as the support for different node attributes (also sometimes called "graph signals"). In this case we have that: -- `A` is a sparse matrix of shape `(N, N)`; -- `X` is a tensor in batch mode, of shape `(batch, N, F)`; +- `A` is a matrix of shape `[node, node]`; +- `X` is a tensor in batch mode, of shape `[batch, node, n_feat)`; Currently, there are no layers in Spektral that support mixed mode and edge attributes. 
An example of a mixed mode dataset is the MNIST random grid ([Defferrard et al., 2016](https://arxiv.org/abs/1606.09375)): ```py ->>> from spektral.datasets import mnist ->>> X_tr, y_tr, X_va, y_va, X_te, y_te, A = mnist.load_data() ->>> A.shape -(784, 784) ->>> X_tr.shape -(50000, 784, 1) +>>> from spektral.datasets import MNIST +>>> dataset = MNIST() +>>> dataset +MNIST(n_graphs=70000) +``` + +Mixed-mode datasets have a special `a` attribute that stores the adjacency matrix, while the proper graphs that make up the dataset only have node features: + +```py +>>>dataset.a +<784x784 sparse matrix of type '' + with 6396 stored elements in Compressed Sparse Row format> + +>>> dataset[0] +Graph(n_nodes=784, n_node_features=1, n_edge_features=None, n_labels=1) + +>>>dataset[0].a +# None ``` + +For this reason, a `PackedBatchLoader` will work perfectly for our mixed mode datasets (a `BatchLoader` will work perfectly, as well, but it will have slightly more Python overhead). + +Mixed mode requires a bit more work than the other three modes. In particular, it is not possible to use `loader.load()` to train a model in this mode. + +Have a look at [this example](https://github.com/danielegrattarola/spektral/blob/master/examples/other/graph_signal_classification_mnist.py) to see how to train a GNN in mixed mode. diff --git a/docs/templates/layers/pooling.md b/docs/templates/layers/pooling.md index ece8f1ed..e29dccf0 100644 --- a/docs/templates/layers/pooling.md +++ b/docs/templates/layers/pooling.md @@ -7,3 +7,9 @@ See [the convolutional layers page](/layers/convolution) for the notation. --- {{autogenerated}} + +--- + +## Global pooling layers + +{{autogenerated}} \ No newline at end of file diff --git a/spektral/data/graph.py b/spektral/data/graph.py index 32dc4d3c..75fe9d01 100644 --- a/spektral/data/graph.py +++ b/spektral/data/graph.py @@ -79,8 +79,8 @@ def __contains__(self, key): return key in self.keys def __repr__(self): - return 'Graph(n_nodes={}, n_node_features={}, n_edge_features={}, y={})'\ - .format(self.n_nodes, self.n_node_features, self.n_edge_features, self.y) + return 'Graph(n_nodes={}, n_node_features={}, n_edge_features={}, n_labels={})'\ + .format(self.n_nodes, self.n_node_features, self.n_edge_features, self.n_labels) @property def n_nodes(self): diff --git a/spektral/data/loaders.py b/spektral/data/loaders.py index f0f1b941..13be610b 100644 --- a/spektral/data/loaders.py +++ b/spektral/data/loaders.py @@ -201,15 +201,17 @@ def __init__(self, dataset, epochs=None, sample_weights=None): def collate(self, batch): graph = batch[0] output = graph.numpy() - output = [output[:-1], output[-1]] # Sparse matrices to SparseTensors + output = list(output) for i in range(len(output)): if sp.issparse(output[i]): output[i] = sp_matrix_to_sp_tensor(output[i]) + output = tuple(output) + output = (output[:-1], output[-1]) if self.sample_weights is not None: - output += [self.sample_weights] + output += (self.sample_weights, ) return tuple(output) def load(self): From e66207727c6ca3182fdd70f75b99d294c6cc9ea3 Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Fri, 27 Nov 2020 17:04:12 +0100 Subject: [PATCH 54/57] Docs --- docs/templates/data-modes.md | 2 +- docs/templates/getting-started.md | 3 ++- spektral/layers/convolutional/appnp_conv.py | 4 ++-- spektral/layers/convolutional/ecc_conv.py | 8 +++----- spektral/layers/convolutional/gcs_conv.py | 4 ++-- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/templates/data-modes.md b/docs/templates/data-modes.md index 
654d5d7a..f83b0171 100644 --- a/docs/templates/data-modes.md +++ b/docs/templates/data-modes.md @@ -2,7 +2,7 @@ Creating mini-batches of data can be tricky when the samples have different shapes. -In traditional neural networks, we're used to stretching, cropping, or padding our data so that the inputs are standardized. +In traditional neural networks, we're used to stretching, cropping, or padding our data so that all inputs to our models are standardized. For instance, images of different sizes can be modified so that they fit into a tensor of shape `[batch, width, height, channels]`. Sequences can be padded so that they have shape `[batch, time, channels]`. And so on... diff --git a/docs/templates/getting-started.md b/docs/templates/getting-started.md index c3bde713..587b8d69 100644 --- a/docs/templates/getting-started.md +++ b/docs/templates/getting-started.md @@ -107,7 +107,8 @@ Since this is a fairly common operation, Spektral has a transform to do it: >>> dataset.apply(GCNFilter()) ``` -Many layers will require you to do some form of preprocessing. If you don't want to go back to the literature every time, you can use the handy [`LayerPreprocess` transform](/transforms/#layerpreprocess). +Many layers will require you to do some form of preprocessing. If you don't want to go back to the literature every time, every convolutional layer in Spektral has a `preprocess(a)` method that you can use to transform the adjacency matrix as needed.
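For instance, calling `preprocess()` directly is roughly the manual equivalent of the `GCNFilter()` transform used above (a small sketch that reuses the `dataset` from earlier in this tutorial; `preprocess()` is a static method, so it can be called on the layer class itself):

```py
from spektral.layers import GCNConv

# Normalize the adjacency matrix of each graph the way GCNConv expects it
for graph in dataset:
    graph.a = GCNConv.preprocess(graph.a)
```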
+Have a look at the handy [`LayerPreprocess` transform](/transforms/#layerpreprocess). ## Creating a GNN diff --git a/spektral/layers/convolutional/appnp_conv.py b/spektral/layers/convolutional/appnp_conv.py index 334044b1..7e1e0890 100644 --- a/spektral/layers/convolutional/appnp_conv.py +++ b/spektral/layers/convolutional/appnp_conv.py @@ -14,6 +14,8 @@ class APPNPConv(Conv): > [Predict then Propagate: Graph Neural Networks meet Personalized PageRank](https://arxiv.org/abs/1810.05997)
> Johannes Klicpera et al. + **Mode**: single, disjoint, mixed, batch. + This layer computes: $$ \Z^{(0)} = \textrm{MLP}(\X); \\ @@ -23,8 +25,6 @@ class APPNPConv(Conv): where \(\alpha\) is the teleport probability, \(\textrm{MLP}\) is a multi-layer perceptron, and \(K\) is defined by the `propagations` argument. - **Mode**: single, disjoint, mixed, batch. - **Input** - Node features of shape `([batch], n_nodes, n_node_features)`; diff --git a/spektral/layers/convolutional/ecc_conv.py b/spektral/layers/convolutional/ecc_conv.py index 7a196e8f..35377582 100644 --- a/spektral/layers/convolutional/ecc_conv.py +++ b/spektral/layers/convolutional/ecc_conv.py @@ -17,11 +17,6 @@ class ECCConv(Conv): **Mode**: single, disjoint, batch. - **Notes**: - - - In single mode, if the adjacency matrix is dense it will be converted - to a SparseTensor automatically (which is an expensive operation). - This layer computes: $$ \x_i' = \x_{i} \W_{\textrm{root}} + \sum\limits_{j \in \mathcal{N}(i)} @@ -30,6 +25,9 @@ class ECCConv(Conv): where \(\textrm{MLP}\) is a multi-layer perceptron that outputs an edge-specific weight as a function of edge attributes. + **Note:** In single mode, if the adjacency matrix is dense it will be + converted to a SparseTensor automatically (which is an expensive operation). + **Input** - Node features of shape `([batch], n_nodes, n_node_features)`; diff --git a/spektral/layers/convolutional/gcs_conv.py b/spektral/layers/convolutional/gcs_conv.py index cf301ed9..13e98d3d 100644 --- a/spektral/layers/convolutional/gcs_conv.py +++ b/spektral/layers/convolutional/gcs_conv.py @@ -7,7 +7,7 @@ class GCSConv(Conv): r""" - A simple convolutional layer with a skip connection. + A `GraphConv` layer with a trainable skip connection. **Mode**: single, disjoint, mixed, batch. @@ -15,7 +15,7 @@ class GCSConv(Conv): $$ \Z' = \D^{-1/2} \A \D^{-1/2} \X \W_1 + \X \W_2 + \b $$ - where \( \A \) does not have self-loops (unlike in GraphConv). + where \( \A \) does not have self-loops. 
**Input** From f8a8bb4a56ffe2b9ce0254857d6ba651a3b73e2f Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Mon, 30 Nov 2020 11:55:39 +0100 Subject: [PATCH 55/57] Add Creating dataset and creating layer tutorials Update docs --- README.md | 14 +-- docs/mkdocs.yml | 10 +- docs/templates/creating-dataset.md | 105 +++++++++++++++- docs/templates/creating-layer.md | 119 +++++++++++++++++- docs/templates/data-modes.md | 8 +- docs/templates/examples.md | 2 +- docs/templates/external.md | 2 +- docs/templates/getting-started.md | 4 +- spektral/data/dataset.py | 20 +-- spektral/layers/base.py | 4 +- spektral/layers/convolutional/agnn_conv.py | 4 +- spektral/layers/convolutional/appnp_conv.py | 2 +- spektral/layers/convolutional/arma_conv.py | 4 +- spektral/layers/convolutional/cheb_conv.py | 2 +- spektral/layers/convolutional/crystal_conv.py | 2 +- spektral/layers/convolutional/ecc_conv.py | 2 +- spektral/layers/convolutional/edge_conv.py | 2 +- spektral/layers/convolutional/gat_conv.py | 2 +- .../layers/convolutional/gated_graph_conv.py | 2 +- spektral/layers/convolutional/gcn_conv.py | 2 +- spektral/layers/convolutional/gcs_conv.py | 2 +- spektral/layers/convolutional/general_conv.py | 2 +- spektral/layers/convolutional/gin_conv.py | 2 +- .../layers/convolutional/graphsage_conv.py | 2 +- .../layers/convolutional/message_passing.py | 7 +- spektral/layers/convolutional/tag_conv.py | 2 +- 26 files changed, 275 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index d8ee50df..7bc3c57c 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ - + # Welcome to Spektral Spektral is a Python library for graph deep learning, based on the Keras API and TensorFlow 2. The main goal of this project is to provide a simple but flexible framework for creating graph neural networks (GNNs). -You can use Spektral for classifying the userss of a social network, predicting molecular properties, generating new graphs with GANs, clustering nodes, predicting links, and any other task where data is described by graphs. +You can use Spektral for classifying the users of a social network, predicting molecular properties, generating new graphs with GANs, clustering nodes, predicting links, and any other task where data is described by graphs. Spektral implements some of the most popular layers for graph deep learning, including: @@ -30,7 +30,7 @@ You can also find [pooling layers](https://graphneural.network/layers/pooling/), - [Global gated attention pooling](https://arxiv.org/abs/1511.05493) - [SortPool](https://www.cse.wustl.edu/~muhan/papers/AAAI_2018_DGCNN.pdf) -Spektral also includes lots of utilities for for representing, manipulating, and transforming graphs in your graph deep learning projects. +Spektral also includes lots of utilities for representing, manipulating, and transforming graphs in your graph deep learning projects. See how to [get started with Spektral](https://graphneural.network/getting-started/) and have a look at the [examples](https://danielegrattarola.github.io/spektral/examples/) for some templates. @@ -75,10 +75,10 @@ Your models will continue to work in exactly the same way. This is a summary of the new features and changes: -- The new `Graph` and `Dataset` containers standardize the way in which Spektral handles data. -**This does not impact your models**, but makes it easier to use your own data in Spektral. +- The new `Graph` and `Dataset` containers standardize how Spektral handles data. 
+**This does not impact your models**, but makes it easier to use your data in Spektral. - The new `Loader` class hides away all the complexity of creating graph batches. -Whether you want to write your own training loop or use Keras' famous model-dot-fit approach, you only need to worry about the training logic and not the data. +Whether you want to write a custom training loop or use Keras' famous model-dot-fit approach, you only need to worry about the training logic and not the data. - The new `transforms` module implements a wide variety of common operations on graphs, that you can now `apply()` to your datasets. - The new `GeneralConv` and `GeneralGNN` classes let you build models that are, well... general. Using state-of-the-art results from recent literature means that you don't need to worry about which layers or architecture to choose. The defaults will work well everywhere. - New datasets: QM7 and ModelNet10/40, and a new wrapper for OGB datasets. @@ -88,7 +88,7 @@ Whether you want to write your own training loop or use Keras' famous model-dot- ## Contributing -Spektral is an open source project available [on Github](https://github.com/danielegrattarola/spektral), and contributions of all types are welcome. +Spektral is an open-source project available [on Github](https://github.com/danielegrattarola/spektral), and contributions of all types are welcome. Feel free to open a pull request if you have something interesting that you want to add to the framework. The contribution guidelines are available [here](https://github.com/danielegrattarola/spektral/blob/master/CONTRIBUTING.md) and a list of feature requests is available [here](https://github.com/danielegrattarola/spektral/projects/1). diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 7931ea7b..fa7c2499 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -31,13 +31,13 @@ nav: - Tutorials: - Getting started: getting-started.md - Data modes: data-modes.md - - Creating a Dataset: creating-dataset.md - - Creating a Layer: creating-layer.md + - Creating a dataset: creating-dataset.md + - Creating a layer: creating-layer.md - Examples: examples.md - Layers: - - Convolutional Layers: layers/convolution.md - - Pooling Layers: layers/pooling.md - - Base Layers: layers/base.md + - Convolutional layers: layers/convolution.md + - Pooling layers: layers/pooling.md + - Base layers: layers/base.md - Models: models.md - Data: - Containers: data.md diff --git a/docs/templates/creating-dataset.md b/docs/templates/creating-dataset.md index eb03b22a..d2e5c50d 100644 --- a/docs/templates/creating-dataset.md +++ b/docs/templates/creating-dataset.md @@ -1 +1,104 @@ -# Creating a Custom Dataset \ No newline at end of file +# Creating a Custom Dataset + +The `Dataset` class is a new feature of Spektral 1.0 that standardizes how graph datasets are represented in Spektral. + +In this tutorial, we'll go over a simple example to create a custom dataset with your data. + +This is also useful if you want to share you datasets publicly or include them as part of Spektral. + +## Essential information + +There are just a few things to know about datasets before you can create your own. + +You create a dataset by subclassing the `spektral.data.Dataset` class. + +The core of datasets is the `read()` method. This is called at every instantiation of the dataset and must return a list of `spektral.data.Graph`. +It doesn't matter if you read the data from a file or create it on the fly, this is simply where the dataset is loaded in memory. 
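For instance, the smallest possible dataset is little more than a `read()` method that builds some `Graph` objects on the fly (a minimal sketch with random placeholder data, separate from the fuller example below):

```py
import numpy as np

from spektral.data import Dataset, Graph


class MySimpleDataset(Dataset):
    def read(self):
        # read() must return a list of Graph objects
        graphs = []
        for _ in range(10):
            x = np.random.rand(5, 3)             # 5 nodes, 3 node features
            a = np.random.randint(0, 2, (5, 5))  # random binary adjacency matrix
            y = np.random.randint(0, 2)          # a binary graph label
            graphs.append(Graph(x=x, a=a, y=y))
        return graphs
```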
+
+All datasets have a `path` property that represents the directory in which the data is stored. This defaults to `~/.spektral/datasets/[ClassName]`.
+You can ignore it if you don't need to store your data on disk.
+However, each time you instantiate a Dataset it will check whether `path` exists. If it doesn't, the `download()` method will be called. + +You can use `download()` to define any additional operations that are needed to save your raw data to disk. The method will be called **before** `read()`. + +Both `read()` and `download()` are called by the dataset's `__init__()` method. If you need to override the initialization of your dataset, make sure to call `super().__init__()` somewhere in your implementation (usually as the last line). + +## Example + +This is a simple example that shows how to create a custom dataset with five random graphs. We pretend that the data comes from an online source so that we can show how to use `download()`. + +We start by overriding the `__init__()` method to store some custom parameters of the dataset: + +```py +class MyDataset(Dataset): + """ + A dataset of five random graphs. + """ + def __init__(self, nodes, feats, **kwargs): + self.nodes = nodes + self.feats = feats + + super().__init__(**kwargs) +``` + +Remember to call `super().__init__(**kwargs)` as the last line. + +Then, we simulate downloading the data from the web. Since this method gets called if `path` does not exist on the system, it makes sense to create the corresponding directory now: + +```py +def download(self): + data = ... # Download from somewhere + + # Create the directory + os.mkdir(self.path) + + # Write the data to file + for i in range(5): + x = rand(self.nodes, self.feats) + a = randint(0, 2, (self.nodes, self.nodes)) + y = randint(0, 2) + + filename = os.path.join(self.path, f'graph_{i}') + np.savez(filename, x=x, a=a, y=y) +``` + +Finally, we implement the `read()` method to return a list of `Graph` objects: + +```py +def read(self): + # We must return a list of Graph objects + output = [] + + for i in range(5): + data = np.load(os.path.join(self.path, f'graph_{i}.npz')) + output.append( + Graph(x=data['x'], a=data['a'], y=data['y']) + ) + + return output +``` + +We can now instantiate our dataset, which will "download" our data and read it into memory: + +``` +>>> dataset = MyDataset(3, 2) +>>> dataset +MyDataset(n_graphs=5) +``` + +We can see that our graphs were saved to file: + +```sh +$ ls ~/.spektral/datasets/MyDataset/ +graph_0.npz graph_1.npz graph_2.npz graph_3.npz graph_4.npz +``` + +so the next time we create `MyDataset` it will read from the files we have saved. + +--- + +You can now use your custom dataset however you like. [Loaders](/loaders) will work, as well as [transforms](/transforms) and all other features described in the [documentation](/data/#dataset). + +Remember that, if you want, you're free to store your data as you prefer. Datasets in Spektral are just there to simplify your workflow, but the library is still designed according to Keras' principle of not getting in your way. If you want to manipulate your data differently, your GNNs will still work. + +You can also see [this script](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/custom_dataset.py) for another example on how to create and use a custom dataset. diff --git a/docs/templates/creating-layer.md b/docs/templates/creating-layer.md index 024972c2..879be72b 100644 --- a/docs/templates/creating-layer.md +++ b/docs/templates/creating-layer.md @@ -1 +1,118 @@ -# Creating a Message-Passing Layer \ No newline at end of file +# Creating a Message-Passing Layer + +In this tutorial we go over the `MessagePassing` interface for creating GNN layers. 
+ +This is a very flexible class that is based on three main functions: message, aggregate and update. +By overriding these methods, you can define the behaviour of your own layers. + +## Essential information + +The `MessagePassing` layer can be subclassed to create layers that work in [single and disjoint mode](/data-modes) using sparse adjacency matrices. This ensures that your layers will work for both node-level and graph-level learning while being very computationally efficient. + +The functionality of these layers is defined by the `message`, `aggregate` and `update` methods, and is summarized as follows: + +```python + +x_out[i] = update(x[i], aggregate([message(x[j]) for j in neighbours(i)])) + +``` + +The `message` function computes a transformation of the neighbours of each node. The `aggregate` function aggregates the messages in a way that is independent of the order in which the messages are processed (like a sum, an average, the maximum, etc). The `update` function takes the aggregated messages from the neighbours and decides how to transform each node. + +This message-passing scheme is computed by calling the `propagate` method of the class, which will return the updated node features (`x_out`). + +## Example + +In this example we will implement a graph convolutional network ([Kipf & and Welling, 2016](https://arxiv.org/abs/1609.02907)) using the MessagePassing interface. + +First, let's add some trainable parameters when creating the layer: + +```py +class GCN(MessagePassing): + def __init__(self, n_out, activation): + super().__init__(activation=activation) + self.n_out = n_out + + def build(self, input_shape): + n_in = input_shape[0][-1] + self.weights = self.add_weight(shape=(n_in, self.n_out)) +``` + +Note that the Keras keyword `activation` was passed to the constructor of the superclass. This can be done with any Keras keyword (like regularizers, constraints, etc) and the layer will process them automatically. + +By default, the `call` method of MessagePassing layers will only call `propagate`. We modify it so that it also updates the node features before starting the propagation: + +```py +def call(self, inputs): + x, a = inputs + + # Update node features + x = tf.matmul(x, self.weights) + + return self.propagate(x=x, a=a) +``` + +Then, we implement the `message` function, which only needs to get the neighbours for each node.
+The `get_i` and `get_j` built-in methods can be used to automatically access either side of the edges \(i \leftarrow j\). For instance, we can use `get_j` to access the node features `x[j]` of all neighbors `j`. + +If you need direct access to the edge indices, you can use the `index_i` and `index_j` attributes directly. + +In this case, we only need to get the neighbors' features and return them: + +```py +def message(self, x): + # Get the node features of all neighbors + return self.get_j(x) +``` + +Then, we define an aggregation function for the messages. We can use a simple average of the nodes: + +```py +from spektral.layers.ops import scatter_mean + +def aggregate(self, messages): + return scatter_mean(messages, self.index_i, self.n_nodes) +``` + +**Note**: `n_nodes` is computed dynamically at the start of propagation, exactly like `index_i`. + +Since there are a few common aggregation functions that are often used in the literature, you can also skip the implementation of this method and simply pass a special keyword to the `__init__()` method of the superclass: + +```py +def __init__(self): + # Equivalent to the above implementation of aggregate + super().__init__(aggregate='mean') +``` + +Finally, we can use the `update` method to apply the activation function: + +```py +def update(self, embeddings): + return self.activation(embeddings) +``` + +This is enough to get started with building your own layers in Spektral. + +## Notes + +An important feature of the MessagePassing class is that any extra keyword argument given to `propagate`, will be matched to the signature of `message`, `aggregate` and `update` and forwarded to those functions if a match is found. + +For example, we can call: + +```py +propagate(x=x, a=a, extra_tensor=extra_tensor) +``` + +and define the message function as: + +```py +def message(self, x, extra_tensor=None): + ... # Do something with extra_tensor +``` + + +Finally, we already noted that MessagePassing layers only support single and disjoint mode, and they also require that the adjacency matrix is a SparseTensor. + +If you need more control on your layers, you can have a look at `spektral.layers.Conv` for a stripped-down class that performs no checks on the inputs and only implements some essential features like keyword parsing. + +For example, `spektral.layers.GCNConv` implements the same GCN layer that we just saw, using the `Conv` class so that it can provide support for batch and mixed mode, as well as dense adjacency matrices. diff --git a/docs/templates/data-modes.md b/docs/templates/data-modes.md index f83b0171..379baf7d 100644 --- a/docs/templates/data-modes.md +++ b/docs/templates/data-modes.md @@ -27,7 +27,7 @@ In all data modes, our goal is to represent one or more graphs by grouping their |:---------|:--|:--|:--| |`Single` |`[nodes, nodes]`|`[nodes, n_feat]`|`[edges, e_feat]`| |`Disjoint`|`[nodes, nodes]`|`[nodes, n_feat]`|`[edges, e_feat]`| -|`Batch` |`[batch, n_max, n_max]`|`[batch, n_max, n_feat]`|`[batch, n_max, n_max, e_feat]`| +|`Batch` |`[batch, nodes, nodes]`|`[batch, nodes, nodes]`|`[batch, nodes, nodes, e_feat]`| |`Mixed` |`[nodes, nodes]`|`[batch, nodes, n_feat]`| `n/a` | In the following sections we describe the four modes more into detail. @@ -87,7 +87,7 @@ In disjoint mode we represent a set of graphs as a single graph, their "disjoint The shapes of the three matrices are the same as single mode, but `nodes` is the number of all the nodes in the set of graphs. 
-In order to keep track of the different graphs in the disjoint union, we use an additional array of zero-based indices `I` that identify which nodes belong to which graph. +To keep track of the different graphs in the disjoint union, we use an additional array of zero-based indices `I` that identify which nodes belong to which graph. For instance: if node 8 belongs to the third graph, we will have `I[8] == 2`.
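As a quick illustration (a NumPy-only sketch; in practice the loaders described below compute `I` for you), the index vector for three graphs with 2, 3, and 4 nodes looks like this:

```py
>>> import numpy as np
>>> np.repeat(np.arange(3), [2, 3, 4])  # one graph index per node
array([0, 0, 1, 1, 1, 2, 2, 2, 2])
```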
In the example above, color blue represents 0, green is 1, and orange is 2 @@ -136,7 +136,7 @@ Since Loaders are effectively generators, we can inspect the first batch that th (79, ) ``` -Note that, since we don't have edge attributes in our datset, the loader did not create the `E` matrix. +Note that, since we don't have edge attributes in our dataset, the loader did not create the `E` matrix. @@ -186,7 +186,7 @@ In mixed mode we have a single graph that acts as the support for different node In this case we have that: - `A` is a matrix of shape `[node, node]`; -- `X` is a tensor in batch mode, of shape `[batch, node, n_feat)`; +- `X` is a tensor in batch mode, of shape `[batch, node, n_feat]`; Currently, there are no layers in Spektral that support mixed mode and edge attributes. diff --git a/docs/templates/examples.md b/docs/templates/examples.md index 2219d895..265a4e6e 100644 --- a/docs/templates/examples.md +++ b/docs/templates/examples.md @@ -1,6 +1,6 @@ # Examples -This is a collection of examples that you can use as template to solve your own tasks. +This is a collection of examples that you can use as template for your projects. ## Node-level prediction diff --git a/docs/templates/external.md b/docs/templates/external.md index 18ed0c5d..2ea39ed7 100644 --- a/docs/templates/external.md +++ b/docs/templates/external.md @@ -4,7 +4,7 @@ This is a collection of additional material about Spektral. ## Paper -We have presented the library at the ICML 2020 workshop "Graph Representation Learning and Beyond". +We presented the library at the ICML 2020 workshop "Graph Representation Learning and Beyond". Paper: diff --git a/docs/templates/getting-started.md b/docs/templates/getting-started.md index 587b8d69..82eec7df 100644 --- a/docs/templates/getting-started.md +++ b/docs/templates/getting-started.md @@ -60,7 +60,7 @@ Datasets also provide methods for applying **transforms** to each data: - `map(transform)` - returns a list obtained by applying the `transform` to each graph; - `filter(function)` - removes from the dataset any graph for which `function(graph)` is `False`. This is also an in-place operation. -For exampe, let's modify our dataset so that we only have graphs with less than 500 nodes: +For example, let's modify our dataset so that we only have graphs with less than 500 nodes: ```python >>> dataset.filter(lambda g: g.n_nodes < 500) @@ -225,7 +225,7 @@ print('Test loss: {}'.format(loss)) ## Node-level learning -Besides learning to predict labels for the whole graph, like in this tutorial, GNNs are very effective at learning to predict labels for each individual node. This is called "node-level learning" and we usually do it for datasets with one big graph (think a social network). +Besides learning to predict labels for the whole graph, like in this tutorial, GNNs are very effective at learning to predict labels for each node. This is called "node-level learning" and we usually do it for datasets with one big graph (think a social network). For example, reproducing the results of the [GCN paper for classifying nodes in a citation network](https://arxiv.org/abs/1609.02907) can be done with `GCNConv` layers, the `Citation` dataset, and a `SingleLoader`: check out [this example](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gcn.py). 
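In short, that setup looks roughly like this (a sketch rather than a complete script: it assumes a compiled Keras `model` built with `GCNConv` layers, and attribute names such as `mask_tr` should be double-checked against the linked example):

```py
from spektral.data import SingleLoader
from spektral.datasets import Citation
from spektral.transforms import GCNFilter

dataset = Citation('cora')
dataset.apply(GCNFilter())  # pre-process the adjacency matrix for GCNConv

# One big graph: the loader yields it once per step, with the training mask
# passed as sample weights so that only training nodes contribute to the loss
loader = SingleLoader(dataset, sample_weights=dataset.mask_tr)
model.fit(loader.load(), steps_per_epoch=loader.steps_per_epoch, epochs=200)
```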
diff --git a/spektral/data/dataset.py b/spektral/data/dataset.py index 127f7984..6f815f48 100644 --- a/spektral/data/dataset.py +++ b/spektral/data/dataset.py @@ -26,12 +26,12 @@ class Dataset: Datasets have the following properties that automatically computed from the graphs: - - `n_nodes`: the number of nodes in the dataset (returns `None` if the number - changes between graphs); - - `n_node_features`: the size of the node features (returns `None` if the size changes - between graphs or is not defined); - - `n_edge_features`: the size of the edge features (returns `None` if the size changes - between graphs or is not defined); + - `n_nodes`: the number of nodes in the dataset (returns `None` if the + number changes between graphs); + - `n_node_features`: the size of the node features (returns `None` if + the size changes between graphs or is not defined); + - `n_edge_features`: the size of the edge features (returns `None` if + the size changes between graphs or is not defined); - `n_labels`: the size of the labels (returns `None` if the size changes between graphs or is not defined); this is computed as the innermost dimension of the labels (i.e., `y.shape[-1]`). @@ -51,12 +51,12 @@ class Dataset: - `map(transform, reduce=None)`: returns a list containing the output of `transform(graph)` for each graph. If `reduce` is a `callable`, then returns `reduce(output_list)` instead of just `output_list`. - For instance: `map(lambda: g.n_nodes, reduce=np.mean)` will return the average - number of nodes in the dataset. + For instance: `map(lambda: g.n_nodes, reduce=np.mean)` will return the + average number of nodes in the dataset. - `filter(function)`: removes from the dataset any graph for which `function(graph)` returns `False`. - For example: `filter(lambda: g.n_nodes < 100)` removes from the dataset all graphs - bigger than 100 nodes. + For example: `filter(lambda: g.n_nodes < 100)` removes from the dataset all + graphs bigger than 100 nodes. You can extend this class to create your own dataset. To create a `Dataset`, you must implement the `Dataset.read()` method, which diff --git a/spektral/layers/base.py b/spektral/layers/base.py index 755e0a8c..665498df 100644 --- a/spektral/layers/base.py +++ b/spektral/layers/base.py @@ -100,7 +100,7 @@ class InnerProduct(Layer): :param trainable_kernel: add a trainable square matrix between the inner product (e.g., `X @ W @ X.T`); - :param activation: activation function to use; + :param activation: activation function; :param kernel_initializer: initializer for the weights; :param kernel_regularizer: regularization applied to the kernel; :param kernel_constraint: constraint applied to the kernel; @@ -183,7 +183,7 @@ class MinkowskiProduct(Layer): :param input_dim_1: first dimension of the input Tensor; set this if you encounter issues with shapes in your model, in order to provide an explicit output shape for your layer. - :param activation: activation function to use; + :param activation: activation function; """ def __init__(self, diff --git a/spektral/layers/convolutional/agnn_conv.py b/spektral/layers/convolutional/agnn_conv.py index 5cb63129..1169ab31 100644 --- a/spektral/layers/convolutional/agnn_conv.py +++ b/spektral/layers/convolutional/agnn_conv.py @@ -44,7 +44,7 @@ class AGNNConv(MessagePassing): - `trainable`: boolean, if True, then beta is a trainable parameter. 
Otherwise, beta is fixed to 1; - - `activation`: activation function to use; + - `activation`: activation function; """ def __init__(self, trainable=True, aggregate='sum', activation=None, **kwargs): @@ -72,7 +72,7 @@ def message(self, x, x_norm=None): x_norm_i = self.get_i(x_norm) x_norm_j = self.get_j(x_norm) alpha = self.beta * tf.reduce_sum(x_norm_i * x_norm_j, axis=-1) - alpha = ops.unsorted_segment_softmax(alpha, self.index_i, self.N) + alpha = ops.unsorted_segment_softmax(alpha, self.index_i, self.n_nodes) alpha = alpha[:, None] return alpha * x_j diff --git a/spektral/layers/convolutional/appnp_conv.py b/spektral/layers/convolutional/appnp_conv.py index 7e1e0890..23f3a3f8 100644 --- a/spektral/layers/convolutional/appnp_conv.py +++ b/spektral/layers/convolutional/appnp_conv.py @@ -45,7 +45,7 @@ class APPNPConv(Conv): layer in the MLP (if None, the MLP has only the output layer); - `mlp_activation`: activation for the MLP layers; - `dropout_rate`: dropout rate for Laplacian and MLP layers; - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/arma_conv.py b/spektral/layers/convolutional/arma_conv.py index f6d40b6a..ab39749b 100644 --- a/spektral/layers/convolutional/arma_conv.py +++ b/spektral/layers/convolutional/arma_conv.py @@ -50,10 +50,10 @@ class ARMAConv(Conv): stacks in the layer; - `iterations`: number of iterations to compute each ARMA\(_1\) approximation; - `share_weights`: share the weights in each ARMA\(_1\) stack. - - `gcn_activation`: activation function to use to compute each ARMA\(_1\) + - `gcn_activation`: activation function to compute each ARMA\(_1\) stack; - `dropout_rate`: dropout rate for skip connection; - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/cheb_conv.py b/spektral/layers/convolutional/cheb_conv.py index b66dfb34..5329a88f 100644 --- a/spektral/layers/convolutional/cheb_conv.py +++ b/spektral/layers/convolutional/cheb_conv.py @@ -47,7 +47,7 @@ class ChebConv(Conv): - `channels`: number of output channels; - `K`: order of the Chebyshev polynomials; - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/crystal_conv.py b/spektral/layers/convolutional/crystal_conv.py index a3cb3fb8..e4583a92 100644 --- a/spektral/layers/convolutional/crystal_conv.py +++ b/spektral/layers/convolutional/crystal_conv.py @@ -40,7 +40,7 @@ class CrystalConv(MessagePassing): **Arguments** - `channels`: integer, number of output channels; - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/ecc_conv.py b/spektral/layers/convolutional/ecc_conv.py index 35377582..48d6c3a5 100644 --- a/spektral/layers/convolutional/ecc_conv.py +++ 
b/spektral/layers/convolutional/ecc_conv.py @@ -47,7 +47,7 @@ class ECCConv(Conv): the kernel-generating network; - 'root': if False, the layer will not consider the root node for computing the message passing (first term in equation above), but only the neighbours. - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/edge_conv.py b/spektral/layers/convolutional/edge_conv.py index 6e8ec58c..6a528410 100644 --- a/spektral/layers/convolutional/edge_conv.py +++ b/spektral/layers/convolutional/edge_conv.py @@ -39,7 +39,7 @@ class EdgeConv(MessagePassing): - `mlp_hidden`: list of integers, number of hidden units for each hidden layer in the MLP (if None, the MLP has only the output layer); - `mlp_activation`: activation for the MLP layers; - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/gat_conv.py b/spektral/layers/convolutional/gat_conv.py index 25201165..f74132a4 100644 --- a/spektral/layers/convolutional/gat_conv.py +++ b/spektral/layers/convolutional/gat_conv.py @@ -59,7 +59,7 @@ class GATConv(Conv): - `dropout_rate`: internal dropout rate for attention coefficients; - `return_attn_coef`: if True, return the attention coefficients for the given input (one n_nodes x n_nodes matrix for each head). - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `attn_kernel_initializer`: initializer for the attention weights; diff --git a/spektral/layers/convolutional/gated_graph_conv.py b/spektral/layers/convolutional/gated_graph_conv.py index a8bc038c..d372412f 100644 --- a/spektral/layers/convolutional/gated_graph_conv.py +++ b/spektral/layers/convolutional/gated_graph_conv.py @@ -41,7 +41,7 @@ class GatedGraphConv(MessagePassing): - `channels`: integer, number of output channels; - `n_layers`: integer, number of iterations with the GRU cell; - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/gcn_conv.py b/spektral/layers/convolutional/gcn_conv.py index b0a393df..b3394339 100644 --- a/spektral/layers/convolutional/gcn_conv.py +++ b/spektral/layers/convolutional/gcn_conv.py @@ -35,7 +35,7 @@ class GCNConv(Conv): **Arguments** - `channels`: number of output channels; - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/gcs_conv.py b/spektral/layers/convolutional/gcs_conv.py index 13e98d3d..7799b9a6 100644 --- a/spektral/layers/convolutional/gcs_conv.py +++ b/spektral/layers/convolutional/gcs_conv.py @@ -31,7 +31,7 @@ class GCSConv(Conv): **Arguments** - `channels`: number of output channels; - - `activation`: activation function to 
use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/general_conv.py b/spektral/layers/convolutional/general_conv.py index 764daa84..e0ee6bc1 100644 --- a/spektral/layers/convolutional/general_conv.py +++ b/spektral/layers/convolutional/general_conv.py @@ -62,7 +62,7 @@ class GeneralConv(MessagePassing): - `dropout`: float, dropout rate; - `aggregate`: string or callable, an aggregation function. Supported aggregations: 'sum', 'mean', 'max', 'min', 'prod'. - - `activation`: activation function to use. This layer also supports the + - `activation`: activation function. This layer also supports the advanced activation PReLU by passing `activation='prelu'`. - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; diff --git a/spektral/layers/convolutional/gin_conv.py b/spektral/layers/convolutional/gin_conv.py index 353e62d6..3b356ff6 100644 --- a/spektral/layers/convolutional/gin_conv.py +++ b/spektral/layers/convolutional/gin_conv.py @@ -43,7 +43,7 @@ class GINConv(MessagePassing): - `mlp_hidden`: list of integers, number of hidden units for each hidden layer in the MLP (if None, the MLP has only the output layer); - `mlp_activation`: activation for the MLP layers; - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/graphsage_conv.py b/spektral/layers/convolutional/graphsage_conv.py index 93219cbf..5667972e 100644 --- a/spektral/layers/convolutional/graphsage_conv.py +++ b/spektral/layers/convolutional/graphsage_conv.py @@ -39,7 +39,7 @@ class GraphSageConv(MessagePassing): - `channels`: number of output channels; - `aggregate_op`: str, aggregation method to use (`'sum'`, `'mean'`, `'max'`, `'min'`, `'prod'`); - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; diff --git a/spektral/layers/convolutional/message_passing.py b/spektral/layers/convolutional/message_passing.py index f227ea7c..0e5e7460 100644 --- a/spektral/layers/convolutional/message_passing.py +++ b/spektral/layers/convolutional/message_passing.py @@ -49,7 +49,8 @@ class MessagePassing(Layer): Any extra keyword argument of this function will be populated by `propagate()` if a matching keyword is found.
Use `self.get_i()` and `self.get_j()` to gather the elements using the - indices `i` or `j` of the adjacency matrix. + indices `i` or `j` of the adjacency matrix. Equivalently, you can access + the indices themselves via the `index_i` and `index_j` attributes. ```python aggregate(messages, **kwargs) @@ -102,7 +103,7 @@ def build(self, input_shape): self.built = True def propagate(self, x, a, e=None, **kwargs): - self.N = tf.shape(x)[0] + self.n_nodes = tf.shape(x)[0] self.index_i = a.indices[:, 1] self.index_j = a.indices[:, 0] @@ -124,7 +125,7 @@ def message(self, x, **kwargs): return self.get_j(x) def aggregate(self, messages, **kwargs): - return self.agg(messages, self.index_i, self.N) + return self.agg(messages, self.index_i, self.n_nodes) def update(self, embeddings, **kwargs): return embeddings diff --git a/spektral/layers/convolutional/tag_conv.py b/spektral/layers/convolutional/tag_conv.py index aea20a70..b883d32f 100644 --- a/spektral/layers/convolutional/tag_conv.py +++ b/spektral/layers/convolutional/tag_conv.py @@ -36,7 +36,7 @@ class TAGConv(MessagePassing): - `channels`: integer, number of output channels; - `K`: the order of the layer (i.e., the layer will consider a K-hop neighbourhood for each node); - - `activation`: activation function to use; + - `activation`: activation function; - `use_bias`: bool, add a bias vector to the output; - `kernel_initializer`: initializer for the weights; - `bias_initializer`: initializer for the bias vector; From a6707fc1fb63da2c434c414e2d341a68bf36de2b Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Mon, 30 Nov 2020 12:36:21 +0100 Subject: [PATCH 56/57] Update CONTRIBUTING.md and README.md --- CONTRIBUTING.md | 28 +++++++++------------------- README.md | 3 +-- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 763a2b93..0039c179 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,14 +8,12 @@ If you found a bug in the latest version of Spektral, you can [open an issue](ht Before opening the issue, make sure to follow these steps: -1. Update to the current master branch and see if the problem is already solved. Sometimes a change does not get released immediately on PyPi, so it might be a good idea to install from source. +1. Update to the current `develop` branch and see if the problem is already solved. Sometimes a change does not get released immediately on PyPi, so it might be a good idea to install from source. 2. Check old issues to see if the problem was already solved. -3. Make sure that your configuration checks all requirements, including: +3. Make sure that your configuration matches all requirements, including: - Operating system - Python version - - Version of Spektral - - Version of Tensorflow and Keras (note that since version 0.3 Spektral only supports `tf.keras`) - - Version of CUDA and cuDNN + - Tensorflow version (note that since version 0.3 Spektral only supports `tf.keras` and TensorFlow 2.1 and above) 4. Provide a minimal script to reproduce the issue. The script should be runnable as-is, with no modification or external data. 5. Include any stack trace/errors that you get. @@ -27,7 +25,7 @@ Bug fixes should be added to the `master` branch. ## Feature requests -If you want to request a feature, [open an issue](https://github.com/danielegrattarola/spektral/issues) on GitHub and clearly mark it as a feature request. 
+If you want to request a feature, [open an issue](https://github.com/danielegrattarola/spektral/issues) on GitHub and mark it as a feature request. 1. Give a detailed description of the feature, including why it is important and why it fits in the scope of the project. Spektral is primarily a library for creating graph neural networks, so new features should gravitate around this subject. @@ -47,7 +45,7 @@ There are no hard rules for contributing to Spektral, but you should try to foll **General guidelines:** - Format your code according to PEP8; -- Make sure that the code you contribute is clearly identifiable in a PR (e.g., watch out for your IDE automatically reformatting the whole project); +- Make sure that the code you contribute is clearly identifiable in a PR (e.g., watch out for your IDE automatically reformatting files); - New features should support: - Python >= 3.5 - TensorFlow >= 2.1.0 @@ -64,23 +62,15 @@ There are no hard rules for contributing to Spektral, but you should try to foll - Message-passing/convolutional layers go in their own file in `layers/convolutional/`; - Pooling layers go in their own file in `layers/pooling/`; - Global pooling layers go in `layers/pooling/globalpool.py`; -- Layers should extend `tensorflow.keras.layers.Layer` and implement the following methods: - - `build()` - - `call()` - - `compute_output_shape()` - - `get_config()` -- Convolutional layers should also implement a `preprocess(A)` staticmethod to manipulate/normalize the adjacency matrix before giving it as input to the GNN (e.g., the `preoprocess()` method of GCN adds self-loops and then re-scales each row by the degree); -- Make sure that you understand [data modes](https://spektral.graphneural.network/data/) and that you know the modes supported by your layer. Layers should support at least single or batch mode; -- Many layers in Spektral inherit their base functionality from `GraphConv`, check if this is the case for yours as well; -- There is also a `MessagePassing` layer that offers a quick API to implement convolutional layers in single/disjoint mode; +- Layers should extend `MessagePassing`, `Conv` or `Pool`. +- Make sure that you understand [data modes](https://graphneural.network/data-modes/) and that you know the modes supported by your layer. Layers should support at least one of disjoint or batch mode; **Guidelines for testing:** - Tests are found in `tests/`; -- It's especially important to add tests for any new layer. See `tests/test_layers/` and the files contained there; -- See the comments in each test `.py` for more information; +- See the docstrings in each file for more information; **Guidelines for the documentation:** - See the documentation in the other layers for how to format docstrings (it's important that the format is the same so that the docs can be built automatically); -- Docs are automatically generated using `docs/autogen.py`. Make sure to include any new Layer as an entry in the `PAGES` dictionary. It is not necessary to add utils and other minor functions to `autogen.py` (although you should still write the docstrings). +- Docs are automatically generated using `docs/autogen.py`. Make sure to include any new layer as an entry in the `PAGES` dictionary. It is not necessary to add utils and other minor functions to `autogen.py` (although you should still write the docstrings). 
diff --git a/README.md b/README.md index 7bc3c57c..67adf41a 100644 --- a/README.md +++ b/README.md @@ -70,8 +70,7 @@ To install Spektral on [Google Colab](https://colab.research.google.com/): The 1.0 release of Spektral is an important milestone for the library and brings many new features and improvements. -If you have already used Spektral in your projects, the only major change that you need to be aware of is in the `datasets` API. -Your models will continue to work in exactly the same way. +If you have already used Spektral in your projects, the only major change that you need to be aware of is the new `datasets` API. This is a summary of the new features and changes: From c9167f2ffd0360ecb3b2ad8c1193bc5ef926b96d Mon Sep 17 00:00:00 2001 From: Daniele Grattarola Date: Mon, 30 Nov 2020 13:11:58 +0100 Subject: [PATCH 57/57] Rename examples, fix links --- docs/templates/creating-dataset.md | 4 +--- docs/templates/creating-layer.md | 8 +++---- docs/templates/examples.md | 14 ++++++----- docs/templates/getting-started.md | 23 ++++++++----------- ...-esol_batch.py => ogbg-mol-esol_mincut.py} | 0 ...ol-hiv_disjoint.py => ogbg-mol-hiv_ecc.py} | 0 .../{qm9_disjoint.py => qm9_ecc.py} | 0 .../{qm9_batch.py => qm9_ecc_batch.py} | 0 .../{tud_disjoint.py => tud_gin.py} | 0 ...ion_gat_fast.py => citation_gat_custom.py} | 4 ++-- ...ion_gcn_fast.py => citation_gcn_custom.py} | 0 11 files changed, 25 insertions(+), 28 deletions(-) rename examples/graph_prediction/{ogbg-mol-esol_batch.py => ogbg-mol-esol_mincut.py} (100%) rename examples/graph_prediction/{ogbg-mol-hiv_disjoint.py => ogbg-mol-hiv_ecc.py} (100%) rename examples/graph_prediction/{qm9_disjoint.py => qm9_ecc.py} (100%) rename examples/graph_prediction/{qm9_batch.py => qm9_ecc_batch.py} (100%) rename examples/graph_prediction/{tud_disjoint.py => tud_gin.py} (100%) rename examples/node_prediction/{citation_gat_fast.py => citation_gat_custom.py} (96%) rename examples/node_prediction/{citation_gcn_fast.py => citation_gcn_custom.py} (100%) diff --git a/docs/templates/creating-dataset.md b/docs/templates/creating-dataset.md index d2e5c50d..019d0011 100644 --- a/docs/templates/creating-dataset.md +++ b/docs/templates/creating-dataset.md @@ -8,12 +8,10 @@ This is also useful if you want to share you datasets publicly or include them a ## Essential information -There are just a few things to know about datasets before you can create your own. - You create a dataset by subclassing the `spektral.data.Dataset` class. The core of datasets is the `read()` method. This is called at every instantiation of the dataset and must return a list of `spektral.data.Graph`. -It doesn't matter if you read the data from a file or create it on the fly, this is simply where the dataset is loaded in memory. +It doesn't matter if you read the data from a file or create it on the fly, this is where the dataset is loaded in memory. All datasets have a `path` property that represents the directory in which the data is stored. This defaults to `~/.spektral/datasets/[ClassName]`. You can ignore it if you want.
diff --git a/docs/templates/creating-layer.md b/docs/templates/creating-layer.md index 879be72b..5ad15a66 100644 --- a/docs/templates/creating-layer.md +++ b/docs/templates/creating-layer.md @@ -40,7 +40,7 @@ class GCN(MessagePassing): Note that the Keras keyword `activation` was passed to the constructor of the superclass. This can be done with any Keras keyword (like regularizers, constraints, etc) and the layer will process them automatically. -By default, the `call` method of MessagePassing layers will only call `propagate`. We modify it so that it also updates the node features before starting the propagation: +By default, the `call` method of MessagePassing layers will only call `propagate`. We modify it so that it also transforms the node features before starting the propagation: ```py def call(self, inputs): @@ -52,10 +52,10 @@ def call(self, inputs): return self.propagate(x=x, a=a) ``` -Then, we implement the `message` function, which only needs to get the neighbours for each node.
+Then, we implement the `message` function. The `get_i` and `get_j` built-in methods can be used to automatically access either side of the edges \(i \leftarrow j\). For instance, we can use `get_j` to access the node features `x[j]` of all neighbors `j`. -If you need direct access to the edge indices, you can use the `index_i` and `index_j` attributes directly. +If you need direct access to the edge indices, you can use the `index_i` and `index_j` attributes. In this case, we only need to get the neighbors' features and return them: @@ -95,7 +95,7 @@ This is enough to get started with building your own layers in Spektral. ## Notes -An important feature of the MessagePassing class is that any extra keyword argument given to `propagate`, will be matched to the signature of `message`, `aggregate` and `update` and forwarded to those functions if a match is found. +An important feature of the MessagePassing class is that any extra keyword argument given to `propagate`, will be compared to the signatures of `message`, `aggregate` and `update` and forwarded to those functions if a match is found. For example, we can call: diff --git a/docs/templates/examples.md b/docs/templates/examples.md index 265a4e6e..562bdbc1 100644 --- a/docs/templates/examples.md +++ b/docs/templates/examples.md @@ -5,23 +5,25 @@ This is a collection of examples that you can use as template for your projects. ## Node-level prediction - [Citation networks with GCN](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gcn.py) +- [Citation networks with GCN (custom training loop)](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gcn_custom.py) - [Citation networks with ChebConv](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_cheby.py) - [Citation networks with GAT](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gat.py) +- [Citation networks with GAT (custom training loop)](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_gat_custom.py) - [Citation networks with ARMA](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_arma.py) - [Citation networks with SimpleGCN (custom transform)](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/citation_simple_gc.py) -- [Open Graph Benchmark dataset](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/ogbn-proteins_gcn.py) +- [Open Graph Benchmark dataset](https://github.com/danielegrattarola/spektral/blob/master/examples/node_prediction/ogbn-arxiv_gcn.py) ## Graph-level prediction - [General GNN](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/general_gnn.py) - [Custom dataset](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/custom_dataset.py) - [OGB mol-esol regression with MinCut pooling](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/ogbg-mol-esol_batch.py) -- [OGB mol-hiv classification using edge attributes](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/ogbg-mol-esol_batch.py) -- [Regression on QM9 with ECC](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/qm9_batch.py) -- [Regression on QM9 with ECC and custom training 
loop](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/qm9_disjoint.py) -- [TUDataset classification with GIN](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/tud_disjoint.py) +- [OGB mol-hiv classification (edge attributes)](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/ogbg-mol-esol_batch.py) +- [QM9 regression with ECC (custom training loop)](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/qm9_ecc.py) +- [QM9 regression with ECC (batch mode)](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/qm9_ecc_batch.py) +- [TUDataset classification with GIN](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/tud_gin.py) ## Other applications - [Graph signal classification on MNIST (mixed mode)](https://github.com/danielegrattarola/spektral/blob/master/examples/other/graph_signal_classification_mnist.py) -- [Node clustering on citation networks with minCUT pooling (unsupervised)](https://github.com/danielegrattarola/spektral/blob/master/examples/other/node_clustering_mincut.py) +- [Node clustering on citation networks with MinCut pooling (unsupervised)](https://github.com/danielegrattarola/spektral/blob/master/examples/other/node_clustering_mincut.py) diff --git a/docs/templates/getting-started.md b/docs/templates/getting-started.md index 82eec7df..747f170c 100644 --- a/docs/templates/getting-started.md +++ b/docs/templates/getting-started.md @@ -1,6 +1,6 @@ # Getting started -Spektral is designed according to the guiding principles of the Keras API to make things extremely simple for beginners while maintaining flexibility for experts. +Spektral is designed according to the guiding principles of Keras to make things extremely simple for beginners while maintaining flexibility for experts. In this page we will go over the main features of Spektral while creating a graph neural network for graph classification. @@ -17,7 +17,7 @@ In Spektral, graphs are represented with instances of `spektral.data.Graph` whic - `e`: the **edge features** - usually represented in a sparse edge list format, with a `np.array` of shape `(n_edges, n_edge_features)`. - `y`: the **labels** - can represent anything, from graph labels to node labels, or even something else. -A graph can have all of these attributes or none of them. You can even add extra attributes if you want: after all, a `Graph` is just a plain Python object. For instance, see `graph.n_nodes`, `graph.n_node_features`, etc. +A graph can have all of these attributes or none of them. Since Graphs are just plain Python objects, you can also add extra attributes if you want. For instance, see `graph.n_nodes`, `graph.n_node_features`, etc. ## Datasets @@ -90,14 +90,14 @@ Now we are ready to augment our node features with the one-hot-encoded degree. S >>> dataset.apply(Degree(max_degree)) ``` -We can see that it worked because now we have an extra `max_degree + 1` node features, which are our one-hot vectors: +We can see that it worked because now we have an extra `max_degree + 1` node features: ```python >>> dataset[0] Graph(n_nodes=42, n_node_features=17, n_edge_features=None, y=[1. 0.]) ``` -Since we will be using a `GCNConv` layer in our GNN, we also want to follow the [original paper](https://arxiv.org/abs/1609.02907) that introduced this layer, and do some extra pre-processing of the adjacency matrix. 
@@ -90,14 +90,14 @@ Now we are ready to augment our node features with the one-hot-encoded degree. S
 >>> dataset.apply(Degree(max_degree))
 ```
 
-We can see that it worked because now we have an extra `max_degree + 1` node features, which are our one-hot vectors:
+We can see that it worked because now we have an extra `max_degree + 1` node features:
 
 ```python
 >>> dataset[0]
 Graph(n_nodes=42, n_node_features=17, n_edge_features=None, y=[1. 0.])
 ```
 
-Since we will be using a `GCNConv` layer in our GNN, we also want to follow the [original paper](https://arxiv.org/abs/1609.02907) that introduced this layer, and do some extra pre-processing of the adjacency matrix.
+Since we will be using a `GCNConv` layer in our GNN, we also want to follow the [original paper](https://arxiv.org/abs/1609.02907) that introduced this layer and do some extra pre-processing of the adjacency matrix.
 
 Since this is a fairly common operation, Spektral has a transform to do it:
@@ -196,9 +196,7 @@ and we can finally train our GNN!
 Since loaders are essentially generators, we need to provide the `steps_per_epoch` keyword to `model.fit()` and we don't need to specify a batch size:
 
 ```python
-model.fit(loader.load(),
-          steps_per_epoch=loader.steps_per_epoch,
-          epochs=10)
+model.fit(loader.load(), steps_per_epoch=loader.steps_per_epoch, epochs=10)
 ```
 
 Done!
@@ -218,8 +216,8 @@ loader = BatchLoader(dataset_test, batch_size=32)
 and feed it to the model by calling `load()`:
 
 ```python
-loss = model.evaluate(loader.load(),
-                      steps=loader.steps_per_epoch)
+loss = model.evaluate(loader.load(), steps=loader.steps_per_epoch)
+
 print('Test loss: {}'.format(loss))
 ```
@@ -243,7 +241,6 @@ Make sure to read the documentation, and get in touch [on Github](https://github
 If you want to cite Spektral in your work, refer to our paper:
 
-> Graph Neural Networks in TensorFlow and Keras with Spektral
-> D. Grattarola and C. Alippi
-> ICML 2020 - GRL+ Workshop
-> [https://arxiv.org/abs/2006.12138](https://arxiv.org/abs/2006.12138)
+> [Graph Neural Networks in TensorFlow and Keras with Spektral](https://arxiv.org/abs/2006.12138)
+> Daniele Grattarola and Cesare Alippi
+
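To tie the getting-started hunks together: the degree augmentation, the GCN-specific adjacency pre-processing, and the loader can be chained as below. This is a sketch only; `GCNFilter` is the transform name used by released versions of Spektral and is not shown in the hunks above, the `spektral.transforms` import path is assumed, and `dataset`, `max_degree`, and `model` are taken to come from the surrounding tutorial:

```python
from spektral.data import BatchLoader
from spektral.transforms import Degree, GCNFilter

dataset.apply(Degree(max_degree))   # append one-hot degrees to the node features
dataset.apply(GCNFilter())          # add self-loops and symmetrically normalize A

loader = BatchLoader(dataset, batch_size=32)
model.fit(loader.load(), steps_per_epoch=loader.steps_per_epoch, epochs=10)
```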
diff --git a/examples/graph_prediction/ogbg-mol-esol_batch.py b/examples/graph_prediction/ogbg-mol-esol_mincut.py
similarity index 100%
rename from examples/graph_prediction/ogbg-mol-esol_batch.py
rename to examples/graph_prediction/ogbg-mol-esol_mincut.py
diff --git a/examples/graph_prediction/ogbg-mol-hiv_disjoint.py b/examples/graph_prediction/ogbg-mol-hiv_ecc.py
similarity index 100%
rename from examples/graph_prediction/ogbg-mol-hiv_disjoint.py
rename to examples/graph_prediction/ogbg-mol-hiv_ecc.py
diff --git a/examples/graph_prediction/qm9_disjoint.py b/examples/graph_prediction/qm9_ecc.py
similarity index 100%
rename from examples/graph_prediction/qm9_disjoint.py
rename to examples/graph_prediction/qm9_ecc.py
diff --git a/examples/graph_prediction/qm9_batch.py b/examples/graph_prediction/qm9_ecc_batch.py
similarity index 100%
rename from examples/graph_prediction/qm9_batch.py
rename to examples/graph_prediction/qm9_ecc_batch.py
diff --git a/examples/graph_prediction/tud_disjoint.py b/examples/graph_prediction/tud_gin.py
similarity index 100%
rename from examples/graph_prediction/tud_disjoint.py
rename to examples/graph_prediction/tud_gin.py
diff --git a/examples/node_prediction/citation_gat_fast.py b/examples/node_prediction/citation_gat_custom.py
similarity index 96%
rename from examples/node_prediction/citation_gat_fast.py
rename to examples/node_prediction/citation_gat_custom.py
index c01fed50..121dcfb4 100644
--- a/examples/node_prediction/citation_gat_fast.py
+++ b/examples/node_prediction/citation_gat_custom.py
@@ -1,5 +1,5 @@
 """
-This script is an extension of the citation_gcn_fast.py script.
+This script is an extension of the citation_gcn_custom.py script.
 It shows how to train GAT (with the same experimental setting of the original
 paper), using faster training and test functions.
 """
@@ -98,6 +98,6 @@ def evaluate():
     else:
         current_patience -= 1
         if current_patience == 0:
-            print('Best test acc: {}'.format(best_test_acc))
+            print('Test accuracy: {}'.format(best_test_acc))
             break
 toc('GAT ({} epochs)'.format(epoch))
diff --git a/examples/node_prediction/citation_gcn_fast.py b/examples/node_prediction/citation_gcn_custom.py
similarity index 100%
rename from examples/node_prediction/citation_gcn_fast.py
rename to examples/node_prediction/citation_gcn_custom.py
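For context, the snippet patched in citation_gat_custom.py above is part of a patience-based early-stopping loop. A generic, self-contained sketch of that pattern (hypothetical variable names and a placeholder training step, independent of the script itself):

```python
import random

epochs = 200
patience = 10                    # epochs to wait without improvement
best_val_loss = float('inf')
current_patience = patience


def run_one_epoch():
    # Placeholder for one epoch of training plus validation (hypothetical).
    return random.random()


for epoch in range(1, epochs + 1):
    val_loss = run_one_epoch()
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        current_patience = patience      # reset the counter on improvement
    else:
        current_patience -= 1            # no improvement: spend one unit of patience
        if current_patience == 0:
            print('Early stopping at epoch {}'.format(epoch))
            break
```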