From e0f054fb2754247752044795b53195ea1ce0bd8a Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Thu, 10 Feb 2022 10:06:40 +0000
Subject: [PATCH] [Doc] add doc for DGLCSVDataset

---
 docs/source/api/python/dgl.data.rst |   1 +
 docs/source/guide/data-loadcsv.rst  | 607 ++++++++++++++++++++++++++++
 docs/source/guide/data.rst          |   4 +-
 python/dgl/data/csv_dataset.py      |   8 +-
 4 files changed, 618 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/guide/data-loadcsv.rst

diff --git a/docs/source/api/python/dgl.data.rst b/docs/source/api/python/dgl.data.rst
index 108e012d47c2..8c6f0a6d9a50 100644
--- a/docs/source/api/python/dgl.data.rst
+++ b/docs/source/api/python/dgl.data.rst
@@ -18,6 +18,7 @@ Base Dataset Class
 .. autoclass:: DGLDataset
     :members: download, save, load, process, has_cache, __getitem__, __len__

+.. autoclass:: DGLCSVDataset

 Node Prediction Datasets
 ---------------------------------------

diff --git a/docs/source/guide/data-loadcsv.rst b/docs/source/guide/data-loadcsv.rst
new file mode 100644
index 000000000000..eaec2eeb971f
--- /dev/null
+++ b/docs/source/guide/data-loadcsv.rst
@@ -0,0 +1,607 @@
.. _guide-data-pipeline-loadcsv:

4.6 Loading datasets from CSV files
----------------------------------------------

Problem & Motivation
~~~~~~~~~~~~~~~~~~~~

With the growing interest in graph deep learning, many ML researchers and data scientists
wish to try GNN models on custom datasets. Although DGL has a recommended practice for how
a dataset object should behave once loaded into RAM (see :ref:`guide-data-pipeline-dataset`),
the on-disk storage format is still largely arbitrary. This chapter defines an on-disk graph
storage format based on Comma-Separated Values (CSV) and introduces a new dataset class,
:class:`~dgl.data.DGLCSVDataset`, which loads and processes data in this format in accordance
with the current data pipeline practice. CSV was chosen for its wide acceptance, good
readability and the rich set of toolkits for loading, creating and manipulating it
(e.g., ``pandas``).

Use :class:`~dgl.data.DGLCSVDataset` in DGL
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To create a DGLCSVDataset object:

.. code:: python

    import dgl
    ds = dgl.data.DGLCSVDataset('/path/to/dataset')

The returned ``ds`` object is a standard :class:`~dgl.data.DGLDataset`. For example, if the
dataset is for single-graph node classification, you can use it as:

.. code:: python

    g = ds[0]  # get the graph
    label = g.ndata['label']
    feat = g.ndata['feat']

Data folder structure
~~~~~~~~~~~~~~~~~~~~~

.. code::

    /path/to/dataset/
    |-- meta.yaml     # metadata of the dataset
    |-- edges_0.csv   # edge-level features
    |-- ...           # you can have as many CSVs for edge data as you want
    |-- nodes_0.csv   # node-level features
    |-- ...           # you can have as many CSVs for node data as you want
    |-- graphs.csv    # graph-level features

Node/edge/graph-level data are stored in CSV files. ``meta.yaml`` is a metadata file specifying
where to read the node/edge/graph data and how to parse it in order to construct the dataset
object. A minimal data folder contains one ``meta.yaml`` and two CSVs, one for node data and one
for edge data, in which case the dataset contains a single graph with no graph-level data.
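For reference, here is a minimal sketch (not part of DGL) of how such a folder could be
generated programmatically with ``pandas`` and ``PyYAML``; the dataset path, file names and
field names below simply follow the conventions described above:

.. code:: python

    import os
    import pandas as pd
    import yaml

    path = '/path/to/dataset'
    os.makedirs(path, exist_ok=True)

    # Node and edge tables using the default field names.
    pd.DataFrame({'node_id': [0, 1, 2]}).to_csv(
        os.path.join(path, 'nodes_0.csv'), index=False)
    pd.DataFrame({'src_id': [0, 1], 'dst_id': [1, 2]}).to_csv(
        os.path.join(path, 'edges_0.csv'), index=False)

    # Metadata pointing DGLCSVDataset at the two CSV files.
    meta = {
        'dataset_name': 'my_dataset',
        'node_data': [{'file_name': 'nodes_0.csv'}],
        'edge_data': [{'file_name': 'edges_0.csv'}],
    }
    with open(os.path.join(path, 'meta.yaml'), 'w') as f:
        yaml.safe_dump(meta, f)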
Examples
~~~~~~~~

Dataset of a single feature-less graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When the dataset contains only one graph with no node or edge features, only three files
are needed in the data folder: ``meta.yaml``, one CSV for node IDs and one CSV for edges:

.. code::

    ./mini_featureless_dataset/
    |-- meta.yaml
    |-- nodes.csv
    |-- edges.csv

``meta.yaml`` contains the following information:

.. code:: yaml

    dataset_name: mini_featureless_dataset
    edge_data:
      - file_name: edges.csv
    node_data:
      - file_name: nodes.csv

``nodes.csv`` lists the node IDs under the ``node_id`` field:

.. code::

    node_id
    0
    1
    2
    3
    4

``edges.csv`` lists all the edges in two columns (``src_id`` and ``dst_id``) specifying the
source and destination node ID of each edge:

.. code::

    src_id,dst_id
    4,4
    4,1
    3,0
    4,1
    4,0
    1,2
    1,3
    3,3
    1,1
    4,1

After loading, the dataset contains one graph without any features:

.. code:: python

    import dgl
    dataset = dgl.data.DGLCSVDataset('./mini_featureless_dataset')
    g = dataset[0]  # only one graph
    print(g)
    #Graph(num_nodes=5, num_edges=10,
    #      ndata_schemes={}
    #      edata_schemes={})

A graph without any features is often of limited interest. The next example shows
how node and edge features are stored.

.. note::
    The generated graph is always directed. If you need reverse edges, add them manually,
    e.g., with :func:`dgl.add_reverse_edges`.

Dataset of a single graph with features and labels
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When the dataset contains only one graph with node or edge features and labels, still only
three files are needed in the data folder: ``meta.yaml``, one CSV for nodes and one CSV
for edges:

.. code::

    ./mini_feature_dataset/
    |-- meta.yaml
    |-- nodes.csv
    |-- edges.csv

``meta.yaml``:

.. code:: yaml

    dataset_name: mini_feature_dataset
    edge_data:
      - file_name: edges.csv
    node_data:
      - file_name: nodes.csv

``edges.csv``:

.. code::

    src_id,dst_id,label,train_mask,val_mask,test_mask,feat
    4,0,2,False,True,True,"[0.5477868606453535, 0.4470617033458436, 0.936706701616337]"
    4,0,0,False,False,True,"[0.9794634290792008, 0.23682038840665198, 0.049629338970987646]"
    0,3,1,True,True,True,"[0.8586722047523594, 0.5746912787380253, 0.6462162561249654]"
    0,1,2,True,False,False,"[0.2730008213674695, 0.5937484188166621, 0.765544096939567]"
    0,2,1,True,True,True,"[0.45441619816038514, 0.1681403185591509, 0.9952376085297715]"
    0,0,0,False,False,False,"[0.4197669213305396, 0.849983324532477, 0.16974127573016262]"
    2,2,1,False,True,True,"[0.5495035052928215, 0.21394654203489705, 0.7174910641836348]"
    1,0,2,False,True,False,"[0.008790817766266334, 0.4216530595907526, 0.529195480661293]"
    3,0,0,True,True,True,"[0.6598715708878852, 0.1932390907048961, 0.9774471538377553]"
    4,0,1,False,False,False,"[0.16846068931179736, 0.41516080644186737, 0.002158116134429955]"

``nodes.csv``:

.. code::

    node_id,label,train_mask,val_mask,test_mask,feat
    0,1,False,True,True,"[0.07816474278491703, 0.9137336384979067, 0.4654086994009452]"
    1,1,True,True,True,"[0.05354099924658973, 0.8753101998792645, 0.33929432608774135]"
    2,1,True,False,True,"[0.33234211884156384, 0.9370522452510665, 0.6694943496824788]"
    3,0,False,True,False,"[0.9784264442230887, 0.22131880861864428, 0.3161154827254189]"
    4,1,True,True,False,"[0.23142237259162102, 0.8715767748481147, 0.19117861103555467]"
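As an aside, here is one way (a sketch, not part of DGL) to produce such list-valued ``feat``
cells with ``pandas``. Serializing each vector with ``str()`` yields exactly the quoted format
shown above, because ``to_csv`` automatically quotes any cell containing the separator:

.. code:: python

    import numpy as np
    import pandas as pd

    num_nodes = 5
    df = pd.DataFrame({
        'node_id': np.arange(num_nodes),
        'label': np.random.randint(0, 2, num_nodes),
        'train_mask': np.random.rand(num_nodes) < 0.8,
        # Serialize each 3-dim feature vector into a string like
        # "[0.078, 0.913, 0.465]" so that it fits into a single CSV cell.
        'feat': [str(np.random.rand(3).tolist()) for _ in range(num_nodes)],
    })
    df.to_csv('./mini_feature_dataset/nodes.csv', index=False)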
After loading, the dataset contains one graph with features and labels:

.. code:: python

    import dgl
    dataset = dgl.data.DGLCSVDataset('./mini_feature_dataset')
    g = dataset[0]  # only one graph
    print(g)
    #Graph(num_nodes=5, num_edges=10,
    #      ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'feat': Scheme(shape=(3,), dtype=torch.float64)}
    #      edata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'feat': Scheme(shape=(3,), dtype=torch.float64)})

.. note::
    All columns are read and set as node/edge attributes except ``node_id`` in ``nodes.csv``
    and ``src_id``/``dst_id`` in ``edges.csv``. Users can access them directly, e.g.,
    ``g.ndata['label']``. The value in each cell must be either a single numeric or a list
    of numerics. A list of numerics (the typical format of ``feat``) is stored as a string
    in the raw CSV cell; such a string is converted back into a list of numerics when read.

Dataset of a single heterogeneous graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When the dataset contains a single heterogeneous graph with two node types and two edge
types, only five files are needed in the data folder: ``meta.yaml``, two CSVs for nodes
and two CSVs for edges:

.. code::

    ./mini_hetero_dataset/
    |-- meta.yaml
    |-- nodes_0.csv
    |-- nodes_1.csv
    |-- edges_0.csv
    |-- edges_1.csv

``meta.yaml``: For heterogeneous graphs, ``etype`` and ``ntype`` are required and must be
unique in ``edge_data`` and ``node_data``, respectively. Otherwise, all entries share the
same default edge/node type name, and only the last etype/ntype is kept when the graph is
generated.

.. code:: yaml

    dataset_name: mini_hetero_dataset
    edge_data:
      - file_name: edges_0.csv
        etype:
          - user
          - follow
          - user
      - file_name: edges_1.csv
        etype:
          - user
          - like
          - item
    node_data:
      - file_name: nodes_0.csv
        ntype: user
      - file_name: nodes_1.csv
        ntype: item

``edges_0.csv`` and ``edges_1.csv`` (both files have the same content; this is for
illustration only):

.. code::

    src_id,dst_id,label,feat
    4,4,1,"0.736833152378035,0.10522806046048205,0.9418796835016118"
    3,4,2,"0.5749339182767451,0.20181320245665535,0.490938012147181"
    1,4,2,"0.7697294432580938,0.49397782380750765,0.10864079337442234"
    0,4,0,"0.1364240150959487,0.1393107840629273,0.7901988878812207"
    2,3,1,"0.42988138237505735,0.18389137408509248,0.18431292077750894"
    0,4,2,"0.8613368738351794,0.67985810014162,0.6580438064356824"
    2,4,1,"0.6594951663841697,0.26499036865016423,0.7891429392727503"
    4,1,0,"0.36649684241348557,0.9511783938523962,0.8494919263589972"
    1,1,2,"0.698592283371875,0.038622249776255946,0.5563827995742111"
    0,4,1,"0.5227112950269823,0.3148264185956532,0.47562693094002173"

``nodes_0.csv`` and ``nodes_1.csv`` (both files have the same content; this is for
illustration only):

.. code::

    node_id,label,feat
    0,2,"0.5400687466285844,0.7588441197954202,0.4268254673041745"
    1,1,"0.08680051341900807,0.11446843700743892,0.7196969604886617"
    2,2,"0.8964389655603473,0.23368113896545695,0.8813472954005022"
    3,1,"0.5454703921677284,0.7819383771535038,0.3027939452162367"
    4,1,"0.5365210052235699,0.8975240205792763,0.7613943085507672"
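For intuition, the graph assembled from these files is structurally equivalent to the
following manual construction (a simplified sketch that omits attaching the parsed labels
and features; the edge lists are the values read from the two CSVs above):

.. code:: python

    import torch
    import dgl

    # Source/destination IDs as they appear in edges_0.csv / edges_1.csv.
    src = torch.tensor([4, 3, 1, 0, 2, 0, 2, 4, 1, 0])
    dst = torch.tensor([4, 4, 4, 4, 3, 4, 4, 1, 1, 4])
    g = dgl.heterograph({
        ('user', 'follow', 'user'): (src, dst),
        ('user', 'like', 'item'): (src, dst),
    })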
After loading, the dataset contains one heterograph with features and labels:

.. code:: python

    import dgl
    dataset = dgl.data.DGLCSVDataset('./mini_hetero_dataset')
    g = dataset[0]  # only one graph
    print(g)
    #Graph(num_nodes={'item': 5, 'user': 5},
    #      num_edges={('user', 'follow', 'user'): 10, ('user', 'like', 'item'): 10},
    #      metagraph=[('user', 'user', 'follow'), ('user', 'item', 'like')])
    g.nodes['user'].data
    #{'label': tensor([2, 1, 2, 1, 1]), 'feat': tensor([[0.5401, 0.7588, 0.4268],
    #        [0.0868, 0.1145, 0.7197],
    #        [0.8964, 0.2337, 0.8813],
    #        [0.5455, 0.7819, 0.3028],
    #        [0.5365, 0.8975, 0.7614]], dtype=torch.float64)}
    g.edges['like'].data
    #{'label': tensor([1, 2, 2, 0, 1, 2, 1, 0, 2, 1]), 'feat': tensor([[0.7368, 0.1052, 0.9419],
    #        [0.5749, 0.2018, 0.4909],
    #        [0.7697, 0.4940, 0.1086],
    #        [0.1364, 0.1393, 0.7902],
    #        [0.4299, 0.1839, 0.1843],
    #        [0.8613, 0.6799, 0.6580],
    #        [0.6595, 0.2650, 0.7891],
    #        [0.3665, 0.9512, 0.8495],
    #        [0.6986, 0.0386, 0.5564],
    #        [0.5227, 0.3148, 0.4756]], dtype=torch.float64)}

Dataset of multiple graphs
^^^^^^^^^^^^^^^^^^^^^^^^^^

When the dataset contains multiple graphs (for now, only homogeneous graphs are supported)
with node-, edge- or graph-level features, only four files are needed in the data folder:
``meta.yaml`` and one CSV file each for nodes, edges and graphs:

.. code::

    ./mini_multi_dataset/
    |-- meta.yaml
    |-- nodes.csv
    |-- edges.csv
    |-- graphs.csv

``meta.yaml``:

.. code:: yaml

    dataset_name: mini_multi_dataset
    edge_data:
      - file_name: edges.csv
    node_data:
      - file_name: nodes.csv
    graph_data:
      file_name: graphs.csv

.. note::
    ``graph_id`` should be specified in the nodes/edges/graphs CSV files; otherwise the
    default value ``0`` is used for every row, which may cause unexpected or undefined
    behavior.

``edges.csv``:

.. code::

    graph_id,src_id,dst_id,feat
    0,0,4,"0.39534097273254654,0.9422093637539785,0.634899790318452"
    0,3,0,"0.04486384200747007,0.6453746567017163,0.8757520744192612"
    0,3,2,"0.9397636966928355,0.6526403892728874,0.8643238446466464"
    0,1,1,"0.40559906615287566,0.9848072295736628,0.493888090726854"
    0,4,1,"0.253458867276219,0.9168191778828504,0.47224962583565544"
    0,0,1,"0.3219496197945605,0.3439899477636117,0.7051530741717352"
    0,2,1,"0.692873149428549,0.4770019763881086,0.21937428942781778"
    0,4,0,"0.620118223673067,0.08691420300562658,0.86573472329756"
    0,2,1,"0.00743445923710373,0.5251800239734318,0.054016385555202384"
    0,4,1,"0.6776417760682221,0.7291568018841328,0.4523600060547709"
    1,1,3,"0.6375445528248924,0.04878384701995819,0.4081642382536248"
    1,0,4,"0.776002616178397,0.8851294998284638,0.7321742043493028"
    1,1,0,"0.0928555079874982,0.6156748364694707,0.6985674921582508"
    1,0,2,"0.31328748118329997,0.8326121496142408,0.04133991340612775"
    1,1,0,"0.36786902637778773,0.39161865931662243,0.9971749359397111"
    1,1,1,"0.4647410679872376,0.8478810655406659,0.6746269314422184"
    1,0,2,"0.8117650553546695,0.7893727601272978,0.41527155506593394"
    1,1,3,"0.40707309111756307,0.2796588354307046,0.34846782265758314"
    1,1,0,"0.18626464175355095,0.3523777809254057,0.7863421810531344"
    1,3,0,"0.28357022069634585,0.13774964202156292,0.5913335505943637"

``nodes.csv``:
.. code::

    graph_id,node_id,feat
    0,0,"0.5725330322207948,0.8451870383322376,0.44412796119211184"
    0,1,"0.6624186423087752,0.6118386331195641,0.7352138669985214"
    0,2,"0.7583372765843964,0.15218126307872892,0.6810484348765842"
    0,3,"0.14627522432017592,0.7457985352827006,0.1037097085190507"
    0,4,"0.49037522512771525,0.8778998699783784,0.0911194482288028"
    1,0,"0.11158102039672668,0.08543289788089736,0.6901745368284345"
    1,1,"0.28367647637469273,0.07502571020414439,0.01217200152200748"
    1,2,"0.2472495901894738,0.24285506608575758,0.6494437360242048"
    1,3,"0.5614197853127827,0.059172654879085296,0.4692371689047904"
    1,4,"0.17583413999295983,0.5191278830882644,0.8453123358491914"

``graphs.csv``:

.. code::

    graph_id,feat,label
    0,"0.7426272601929126,0.5197462471155317,0.8149104951283953",0
    1,"0.534822233529295,0.2863627767733977,0.1154897249106891",0

After loading, the dataset contains multiple homogeneous graphs with features and labels:

.. code:: python

    import dgl
    dataset = dgl.data.DGLCSVDataset('./mini_multi_dataset')
    print(len(dataset))
    #2
    graph, label = dataset[0]
    print(graph, label)
    #Graph(num_nodes=5, num_edges=10,
    #      ndata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float64)}
    #      edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float64)}) tensor(0)
    print(dataset.data)
    #{'feat': tensor([[0.7426, 0.5197, 0.8149],
    #        [0.5348, 0.2864, 0.1155]], dtype=torch.float64), 'label': tensor([0, 0])}

YAML Specification
~~~~~~~~~~~~~~~~~~

Example
^^^^^^^

The YAML file below lists all supported keys together, including those with default values;
not all keys are required for a specific use case.

.. code:: yaml

    version: 1.0.0
    dataset_name: full_yaml
    separator: ','
    edge_data:
      - file_name: edges_0.csv
        etype:
          - user
          - follow
          - user
        src_id_field: src_id
        dst_id_field: dst_id
      - file_name: edges_1.csv
        etype:
          - user
          - like
          - item
        src_id_field: src_id
        dst_id_field: dst_id
    node_data:
      - file_name: nodes_0.csv
        ntype: user
        node_id_field: node_id
      - file_name: nodes_1.csv
        ntype: item
        node_id_field: node_id
    graph_data:
      file_name: graphs.csv
      graph_id_field: graph_id

Top-level keys
^^^^^^^^^^^^^^

At the top level, only six keys are available.

``version``
Optional. String. It specifies which version of ``meta.yaml`` is used. More features may be
added in the future, and the version will change accordingly.

``dataset_name``
Required. String. It specifies the dataset name.

``separator``
Optional. String. It specifies how to parse data in the CSV files. Default value: ``,``.

``edge_data``
Required. List of dict. It includes several sub-keys that specify how to parse edges from CSV files.

``node_data``
Required. List of dict. It includes several sub-keys that specify how to parse nodes from CSV files.

``graph_data``
Optional. Dict. It includes several sub-keys that specify how to parse graph-level information
from CSV files. It may be omitted for single-graph datasets, as in the examples above.

Keys for ``edge_data``
^^^^^^^^^^^^^^^^^^^^^^

``file_name``
Required. String. It specifies the file name which stores the edge data.

``etype``
Optional. List of strings. It specifies the canonical edge type.

``src_id_field``
Optional. String. It specifies which column to read for source node IDs. Default value: ``src_id``.

``dst_id_field``
Optional. String. It specifies which column to read for destination node IDs. Default value: ``dst_id``.

Keys for ``node_data``
^^^^^^^^^^^^^^^^^^^^^^

``file_name``
Required. String. It specifies the file name which stores the node data.
``ntype``
Optional. String. It specifies the node type.

``node_id_field``
Optional. String. It specifies which column to read for node IDs. Default value: ``node_id``.

Keys for ``graph_data``
^^^^^^^^^^^^^^^^^^^^^^^

``file_name``
Required. String. It specifies the file name which stores the graph data.

``graph_id_field``
Optional. String. It specifies which column to read for graph IDs. Default value: ``graph_id``.

Parse node/edge/graph data on your own
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

By default, all data is attached to ``g.ndata`` under the same key as the column name in
``nodes.csv``; the same applies to the data in ``edges.csv``. Data is not transformed unless
it is a string of float values (feature data is often in this format). For more flexibility,
users can define their own node/edge/graph data parsers and pass such callable instances
when instantiating ``DGLCSVDataset``. Below is an example.

``DataParser``:

.. code:: python

    import numpy as np
    import pandas as pd

    class SelfDefinedDataParser:
        """Convert string labels into numeric values.
        """
        def __call__(self, df: pd.DataFrame):
            data = {}
            for header in df:
                # Skip index-like columns that pandas may add when reading CSVs.
                if 'Unnamed' in header:
                    print("Unnamed column is found. Ignored...")
                    continue
                dt = df[header].to_numpy().squeeze()
                if header == 'label':
                    # Map 'positive' to 1 and anything else to 0.
                    dt = np.array([1 if e == 'positive' else 0 for e in dt])
                data[header] = dt
            return data

Example:

``customized_parser_dataset``:

.. code::

    ./customized_parser_dataset/
    |-- meta.yaml
    |-- nodes.csv
    |-- edges.csv

``meta.yaml``:

.. code:: yaml

    dataset_name: customized_parser_dataset
    edge_data:
      - file_name: edges.csv
    node_data:
      - file_name: nodes.csv

``edges.csv``:

.. code::

    src_id,dst_id,label
    4,0,positive
    4,0,negative
    0,3,positive
    0,1,positive
    0,2,negative
    0,0,positive
    2,2,negative
    1,0,positive
    3,0,negative
    4,0,positive

``nodes.csv``:

.. code::

    node_id,label
    0,positive
    1,negative
    2,positive
    3,negative
    4,positive

After loading, the string labels in the dataset have been converted into numeric values:

.. code:: python

    import dgl
    dataset = dgl.data.DGLCSVDataset('./customized_parser_dataset',
                                     node_data_parser={'_V': SelfDefinedDataParser()},
                                     edge_data_parser={('_V', '_E', '_V'): SelfDefinedDataParser()})
    print(dataset[0].ndata['label'])
    #tensor([1, 0, 1, 0, 1])
    print(dataset[0].edata['label'])
    #tensor([1, 0, 1, 1, 0, 1, 0, 1, 0, 1])

FAQs
~~~~

What's the data type in CSV files?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

All data is required to be numeric. Specifically, all data except features should be
``integer``. A feature is a ``string`` composed of ``float`` values; such strings are
split and cast into float values when read.

What if some lines in CSV have missing values in several fields?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This is undefined behavior. Please make sure the data is complete.

What if ``graph_id`` is not specified in CSV?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For a single graph, this field in ``edge_data`` and ``node_data`` is not used at all, so it
is safe to omit. For multiple graphs, ``graph_id`` should be provided; otherwise all
edge/node data will be treated as belonging to ``graph_id = 0``, which is usually not what
you expect.
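How do I iterate over a dataset with multiple graphs for training?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

One option is to batch the graphs with :class:`~dgl.dataloading.GraphDataLoader`, as with any
other graph-level dataset. A minimal sketch, assuming the ``mini_multi_dataset`` example
above (``batch_size`` is illustrative):

.. code:: python

    import dgl
    from dgl.dataloading import GraphDataLoader

    dataset = dgl.data.DGLCSVDataset('./mini_multi_dataset')
    dataloader = GraphDataLoader(dataset, batch_size=2, shuffle=True)
    for batched_graph, labels in dataloader:
        # Feed the batched graph and the graph-level labels to your model here.
        pass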
diff --git a/docs/source/guide/data.rst b/docs/source/guide/data.rst
index 034798a2d97a..fe462a1d64cc 100644
--- a/docs/source/guide/data.rst
+++ b/docs/source/guide/data.rst
@@ -23,6 +23,7 @@ shows how to implement each component of it.
 * :ref:`guide-data-pipeline-process`
 * :ref:`guide-data-pipeline-savenload`
 * :ref:`guide-data-pipeline-loadogb`
+* :ref:`guide-data-pipeline-loadcsv`

 .. toctree::
    :maxdepth: 1
@@ -33,4 +34,5 @@
    data-download
    data-process
    data-savenload
-   data-loadogb
\ No newline at end of file
+   data-loadogb
+   data-loadcsv
\ No newline at end of file
diff --git a/python/dgl/data/csv_dataset.py b/python/dgl/data/csv_dataset.py
index 99c23e942df8..335515be0834 100644
--- a/python/dgl/data/csv_dataset.py
+++ b/python/dgl/data/csv_dataset.py
@@ -51,7 +51,13 @@ class CSVDataset(DGLDataset):
     any available graph-level data such as graph-level feature, labels.

     Examples
-    [TODO]: link to a detailed web page.
+    --------
+    ``meta.yaml`` and CSV files are under ``csv_dir``.
+
+    >>> csv_dataset = dgl.data.DGLCSVDataset(csv_dir)
+
+    See more details in :ref:`guide-data-pipeline-loadcsv`.
+
     """
     META_YAML_NAME = 'meta.yaml'