diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 419d1a7..3e81f23 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -45,13 +45,13 @@ jobs: - name: Build documentation run: | - python -m sphinx docs/ docs/_build/ -b html + python -m sphinx docs/ build/html -b html - name: Deploy documentation to Github pages uses: peaceiris/actions-gh-pages@v4 with: github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./docs/_build + publish_dir: ./build/html # Github release - name: Read CHANGELOG diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3c08a24..3e3b888 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -59,9 +59,9 @@ jobs: if: matrix.tasks == 'docs' - name: Test building documentation - run: python -m sphinx docs/ docs/_build/ -b html -W + run: python -m sphinx docs/ build/html -b html -W if: matrix.tasks == 'docs' - name: Check links in documentation - run: python -m sphinx docs/ docs/_build/ -b linkcheck -W + run: python -m sphinx docs/ build/html -b linkcheck -W if: matrix.tasks == 'docs' diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 54bd517..f059a3f 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -88,14 +88,14 @@ You can install it and a few other necessary packages with:: To create the HTML pages, use:: - python -m sphinx docs/ build/sphinx/html -b html + python -m sphinx docs/ build/html -b html -The generated files will be available in the directory :file:`build/sphinx/html/`. +The generated files will be available in the directory :file:`build/html/`. It is also possible to automatically check if all links are still valid:: - python -m sphinx docs/ build/sphinx/html -b linkcheck + python -m sphinx docs/ build/html -b linkcheck ..
_Sphinx: http://sphinx-doc.org diff --git a/audpsychometric/__init__.py b/audpsychometric/__init__.py index 22e55ae..40cd060 100644 --- a/audpsychometric/__init__.py +++ b/audpsychometric/__init__.py @@ -1,5 +1,3 @@ -import audpsychometric.core -from audpsychometric.core import datasets from audpsychometric.core.datasets import list_datasets from audpsychometric.core.datasets import read_dataset from audpsychometric.core.gold_standard import agreement_categorical @@ -7,7 +5,6 @@ from audpsychometric.core.gold_standard import evaluator_weighted_estimator from audpsychometric.core.gold_standard import mode from audpsychometric.core.gold_standard import rater_agreement_pearson -import audpsychometric.core.reliability from audpsychometric.core.reliability import congeneric_reliability from audpsychometric.core.reliability import cronbachs_alpha from audpsychometric.core.reliability import intra_class_correlation diff --git a/audpsychometric/core/datasets/__init__.py b/audpsychometric/core/datasets/__init__.py index faa1be1..3581490 100644 --- a/audpsychometric/core/datasets/__init__.py +++ b/audpsychometric/core/datasets/__init__.py @@ -1,6 +1,3 @@ -"""Provide example datasets for package.""" - - -__all__ = ["read_dataset", "list_dataset"] +__all__ = ["read_dataset", "list_datasets"] import os @@ -19,12 +16,20 @@ def read_dataset(data_set_name: str) -> pd.DataFrame: retrieves a test dataset from within the package. Args: - data_set_name(str): string identifier of the dataset. - This does not need not be identical with the filename + data_set_name(str): dataset name Returns: - table containing dataset - + dataframe containing dataset + + Examples: + >>> df = read_dataset("wine") + >>> df.head() + Wine Judge Scores + 0 1 A 1 + 1 2 A 1 + 2 3 A 3 + 3 4 A 6 + 4 5 A 6 """ ds = data_sets.loc[data_sets["dataset"] == data_set_name] @@ -38,11 +43,22 @@ def list_datasets(): -r"""List tests datasets available in package. +r"""List test datasets available in package.
- Args: - None Returns: - table listing available datasets - - """ + dataframe listing available datasets + + Examples: + >>> list_datasets() + fname ... description + dataset ... + statology statology.csv ... icc sample from web page + hallgren-table5 Hallgren-Table-05.csv ... icc table from publication + hallgren-table3 Hallgren-Table-03.csv ... kappa table from publication + HolzingerSwineford1939 HolzingerSwineford1939.csv ... lavaan + Shrout_Fleiss Shrout_Fleiss_1979.csv ... Dataset from paper + wine wine.csv ... online source + + [6 rows x 4 columns] + + """ # noqa: E501 df_data_sets = data_sets.set_index("dataset") return df_data_sets diff --git a/audpsychometric/core/reliability.py b/audpsychometric/core/reliability.py index d6ab942..099b5b8 100644 --- a/audpsychometric/core/reliability.py +++ b/audpsychometric/core/reliability.py @@ -126,6 +126,9 @@ def intra_class_correlation( The model is based on analysis of variance, and ratings must at least be ordinally scaled. + CCC_ is conceptually and numerically related to the ICC. + For an implementation see :func:`audmetric.concordance_cc`. + Args: ratings: ratings. When given as a 1-dimensional array, @@ -137,6 +140,8 @@ def intra_class_correlation( anova_method: method for ANOVA calculation, can be ``"pingouin"`` or ``"statsmodels"`` + .. _CCC: https://en.wikipedia.org/wiki/Concordance_correlation_coefficient + Returns: icc and additional results lumped into dict @@ -288,20 +293,6 @@ def intra_class_correlation( - :math:`k` is the number of raters - :math:`n` is the number of items - **Implementation Details** - - For doing the analysis, - the :class:`pd.DataFrame` is preprocessed: - The first step will melt - and the data into a long format - for checking incomplete cases. - In this process, - the index will be renamed to a column item - to mimic classical test theory conventions. - The raters will end up in a separate column - containing the ratings. - Ratings will be available under the column rating. 
- """ # noqa: E501 if not isinstance(ratings, pd.DataFrame): df = pd.DataFrame(np.atleast_2d(np.array(ratings))) diff --git a/docs/api-src/audpsychometric.rst b/docs/api-src/audpsychometric.rst index f58da5d..5d49bf9 100644 --- a/docs/api-src/audpsychometric.rst +++ b/docs/api-src/audpsychometric.rst @@ -3,84 +3,17 @@ audpsychometric .. automodule:: audpsychometric -Library to facilitate evaluation and processing of annotated speech. - -Pychometric Analysis --------------------- - -.. autosummary:: - :toctree: - :nosignatures: - - cronbachs_alpha - congeneric_reliability - intra_class_correlation - -The module currently contains two reliability coefficients -from the family of structural equation model (SEM)-based -reliability coefficients. -One of them is Cronbach's alphas -in the function :func:`audpsychometric.cronbachs_alpha`. -This classical coefficient assumes *tau equivalence* -which requires factor loadings to be homogeneous. -The second coefficient -in the function :func:`audpsychometric.congeneric_reliability` -relaxes this assumption -and only assumes a `one-dimensional congeneric reliability`_ model: -congeneric measurement models are characterized by the fact -that the factor loadings of the indicators -do not have to be homogeneous, -i.e. they can differ. - -In addition, -the module implements *Intraclass Correlation (ICC)* analysis. -ICC is based on the analysis of variance of a class of coefficients -that are based on ANOVA -with ratings as the dependent variable, -and terms for targets -(like e.g rated audio chunks), -raters and their interaction are estimated. -Different flavors of ICC are then computed -based on these sum of squares terms. - -Note that the CCC_ is conceptually and numerically related to the ICC. -We do not implement it here, -as there are other implementations available, -e.g. :func:`audmetric.concordance_cc`. - - -Gold Standard Calculation -------------------------- - .. 
autosummary:: :toctree: :nosignatures: agreement_categorical agreement_numerical + cronbachs_alpha + congeneric_reliability evaluator_weighted_estimator + intra_class_correlation + list_datasets mode rater_agreement_pearson - - -Demo Datasets -------------- - -.. autosummary:: - :toctree: - :nosignatures: - - list_datasets read_dataset - -Currently these datasets are defined: - -.. jupyter-execute:: - - from audpsychometric import datasets - df_datasets = datasets.list_datasets() - print(df_datasets) - - -.. _one-dimensional congeneric reliability: https://en.wikipedia.org/wiki/Congeneric_reliability -.. _CCC: https://en.wikipedia.org/wiki/Concordance_correlation_coefficient diff --git a/docs/conf.py b/docs/conf.py index b2f86a3..f09f506 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -26,7 +26,6 @@ ] pygments_style = None extensions = [ - "jupyter_sphinx", # executing code blocks "sphinx.ext.autodoc", "sphinx.ext.napoleon", # support for Google-style docstrings "sphinx.ext.viewcode", diff --git a/docs/requirements.txt b/docs/requirements.txt index f1e898d..bb43359 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,11 +1,8 @@ audeer -ipykernel -jupyter-sphinx sphinx sphinx-apipages >=0.1.2 sphinx-audeering-theme >=1.2.1 sphinx-autodoc-typehints sphinx-copybutton -sphinxcontrib-programoutput sphinxcontrib-bibtex toml diff --git a/tests/conftest.py b/tests/conftest.py index b83ce14..09729a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,12 @@ import numpy as np import pytest -from audpsychometric import datasets +import audpsychometric @pytest.fixture(scope="function") def df_holzinger_swineford(): - df_dataset = datasets.read_dataset("HolzingerSwineford1939") + df_dataset = audpsychometric.read_dataset("HolzingerSwineford1939") cols_use = [col for col in df_dataset.columns if col.startswith("x")] df = df_dataset[cols_use].astype(np.float32) return df diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 
744a51a..a295271 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -7,7 +7,7 @@ def test_list_datasets(): """First basic dataset is available in dataset list.""" - df_datasets = audpsychometric.datasets.list_datasets() + df_datasets = audpsychometric.list_datasets() assert "statology" in df_datasets.index @@ -24,5 +24,5 @@ def test_list_datasets(): ) def test_read_dataset(dataset): """Test functional requirement that a dataset can be read into dataframe.""" - df_dataset = audpsychometric.datasets.read_dataset(dataset) + df_dataset = audpsychometric.read_dataset(dataset) assert isinstance(df_dataset, pd.DataFrame) diff --git a/tests/test_reliability.py b/tests/test_reliability.py index 65eab23..d9f7c30 100644 --- a/tests/test_reliability.py +++ b/tests/test_reliability.py @@ -7,7 +7,7 @@ def test_icc(): """Test icc basic result validity.""" - df_dataset = audpsychometric.datasets.read_dataset("wine") + df_dataset = audpsychometric.read_dataset("wine") data_wide = df_dataset.pivot_table(index="Wine", columns="Judge", values="Scores") @@ -24,7 +24,7 @@ def test_icc(): def test_cronbachs_alpha(): """Test cronbach's alpha return values for three raters.""" - df_dataset = audpsychometric.datasets.read_dataset("hallgren-table3") + df_dataset = audpsychometric.read_dataset("hallgren-table3") df = df_dataset[["Dep_Rater1", "Dep_Rater2", "Dep_Rater3"]] for ratings in [df, df.values]: alpha, result = audpsychometric.cronbachs_alpha(ratings) @@ -56,7 +56,7 @@ def test_anova_helper(): def test_icc_nanremoval(): """Cover nan removal if statement.""" - df_dataset = audpsychometric.datasets.read_dataset("HolzingerSwineford1939") + df_dataset = audpsychometric.read_dataset("HolzingerSwineford1939") df_dataset = df_dataset[[x for x in df_dataset.columns if x.startswith("x")]] nan_mat = np.random.random(df_dataset.shape) < 0.1 audpsychometric.intra_class_correlation(df_dataset.mask(nan_mat))