diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml new file mode 100644 index 0000000..9fa4a65 --- /dev/null +++ b/.github/workflows/linter.yml @@ -0,0 +1,29 @@ +name: Linter + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install pre-commit hooks + run: | + pip install pre-commit + pre-commit install --install-hooks + + - name: Code style check via pre-commit + run: | + pre-commit run --all-files diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 8ac49e3..419d1a7 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,4 +1,4 @@ -name: Publish +name: Publish Python Package on: push: @@ -8,31 +8,36 @@ jobs: deploy: runs-on: ubuntu-latest + environment: release + permissions: + contents: write + id-token: write + concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 2 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.10' - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine + pip install build virtualenv # PyPI package - - name: Build and publish - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: | - python setup.py sdist bdist_wheel - python -m twine upload dist/* + - name: Build Python package + run: python -m build - # Docuemntation + - name: Publish Python package to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + # Documentation - name: Install doc dependencies run: | pip install -r requirements.txt @@ -43,7 +48,7 @@ jobs: python -m sphinx docs/ docs/_build/ -b html - name: Deploy documentation to Github pages - uses: peaceiris/actions-gh-pages@v3 + uses: peaceiris/actions-gh-pages@v4 with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: ./docs/_build @@ -54,20 +59,21 @@ jobs: run: | # Get bullet points from last CHANGELOG entry CHANGELOG=$(git diff -U0 HEAD^ HEAD | grep '^[+][\* ]' | sed 's/\+//') - # Support for multiline, see - # https://github.com/actions/create-release/pull/11#issuecomment-640071918 - CHANGELOG="${CHANGELOG//'%'/'%25'}" - CHANGELOG="${CHANGELOG//$'\n'/'%0A'}" - CHANGELOG="${CHANGELOG//$'\r'/'%0D'}" echo "Got changelog: $CHANGELOG" - echo "::set-output name=body::$CHANGELOG" + # Support for multiline, see + # https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#multiline-strings + { + echo 'body<<EOF' + echo "$CHANGELOG" + echo 'EOF' + } >> "$GITHUB_OUTPUT" - name: Create release on Github id: create_release - uses: actions/create-release@v1 + uses: softprops/action-gh-release@v2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: tag_name: ${{ github.ref }} - release_name: Release ${{ github.ref }} + name: Release ${{ github.ref_name }} body: ${{ steps.changelog.outputs.body }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8e0fb19..3c08a24 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,21 +13,24 @@ jobs: strategy: matrix: os: [ ubuntu-latest, macOS-latest, windows-latest ] - python-version: [ 3.8 ] + python-version: [ "3.10" ] tasks: [ tests ] include: - os: ubuntu-latest - python-version: 3.9 +
python-version: "3.8" tasks: tests - os: ubuntu-latest - python-version: 3.8 + python-version: "3.9" + tasks: tests + - os: ubuntu-latest + python-version: "3.10" tasks: docs steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -45,7 +48,7 @@ jobs: if: matrix.tasks == 'tests' - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} file: ./coverage.xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..119cb13 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +# Configuration of checks run by pre-commit +# +# The tests are executed in the CI pipeline, +# see CONTRIBUTING.rst for further instructions. +# You can also run the checks directly at the terminal, e.g. +# +# $ pre-commit install +# $ pre-commit run --all-files +# +# +default_language_version: + python: python3.10 + +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.8 + hooks: + - id: ruff + args: [ --fix ] + - id: ruff-format + - repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: + - tomli diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 0665601..54bd517 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -1,68 +1,104 @@ Contributing ============ -If you would like to add new functionality fell free to create a `pull -request`_ . If you find errors, omissions, inconsistencies or other things -that need improvement, please create an issue_. -Contributions are always welcome! +Everyone is invited to contribute to this project. +Feel free to create a `pull request`_ . +If you find errors, +omissions, +inconsistencies, +or other things +that need improvement, +please create an issue_. + +.. _issue: https://github.com/audeering/audpsychometric/issues/new/ +.. _pull request: https://github.com/audeering/audpsychometric/compare/ -.. _issue: - https://github.com/audeering/audpsychometric/issues/new/ -.. _pull request: - https://github.com/audeering/audpsychometric/compare/ Development Installation ------------------------ -Instead of pip-installing the latest release from PyPI, you should get the -newest development version from Github_:: +Instead of pip-installing the latest release from PyPI_, +you should get the newest development version from Github_:: + + git clone https://github.com/audeering/audpsychometric/ + cd audpsychometric + # Create virtual environment for this project + # e.g. + # virtualenv --python="python3" $HOME/.envs/audpsychometric + # source $HOME/.envs/audpsychometric/bin/activate + pip install -r requirements.txt + - git clone https://github.com/audeering/audpsychometric/ - cd audpsychometric - # Use virtual environment - pip install -r requirements.txt +This way, +your installation always stays up-to-date, +even if you pull new changes from the Github repository. +.. _PyPI: https://pypi.org/project/audpsychometric/ .. _Github: https://github.com/audeering/audpsychometric/ -This way, your installation always stays up-to-date, -even if you pull new changes -from the repository. -Building the Documentation --------------------------- +Coding Convention +----------------- -If you make changes to the documentation, you can re-create the HTML pages -using Sphinx_. 
-You can install it and a few other necessary packages with:: +We follow the PEP8_ convention for Python code +and use ruff_ as a linter and code formatter. +In addition, +we check for common spelling errors with codespell_. +Both tools and possible exceptions +are defined in :file:`pyproject.toml`. - pip install -r requirements.txt - pip install -r docs/requirements.txt +The checks are executed in the CI using `pre-commit`_. +You can enable those checks locally by executing:: -To create the HTML pages, use:: + pip install pre-commit # consider system wide installation + pre-commit install + pre-commit run --all-files + +Afterwards, ruff_ and codespell_ are executed +every time you create a commit. - python -m sphinx docs/ build/sphinx/html -b html +You can also install ruff_ and codespell_ +and call them directly:: -The generated files will be available in the directory ``build/sphinx/html/``. + pip install ruff codespell # consider system wide installation + ruff check --fix . # lint all Python files, and fix any fixable errors + ruff format . # format code of all Python files + codespell -.. Note:: +The checks can be restricted to specific folders:: - During the default building of the documentation - Jupyter notebooks are not executed to save time. + ruff check audfoo/ tests/ + codespell audfoo/ tests/ -To execute the notebooks as well, copy and paste -the following into your terminal and press the enter key:: - python -m sphinx -W docs/ \ - -D nbsphinx_execute='always' \ - -d build/sphinx/doctrees \ - build/sphinx/html \ - -b html +.. _codespell: https://github.com/codespell-project/codespell/ +.. _PEP8: http://www.python.org/dev/peps/pep-0008/ +.. _pre-commit: https://pre-commit.com +.. _ruff: https://beta.ruff.rs + + +Building the Documentation +-------------------------- + +If you make changes to the documentation, +you can re-create the HTML pages using Sphinx_. +You can install it and a few other necessary packages with:: + + pip install -r docs/requirements.txt + +To create the HTML pages, use:: + + python -m sphinx docs/ build/sphinx/html -b html + +The generated files will be available +in the directory :file:`build/sphinx/html/`. It is also possible to automatically check if all links are still valid:: - python -m sphinx docs/ build/sphinx/linkcheck -b linkcheck + python -m sphinx docs/ build/sphinx/html -b linkcheck + +.. _Sphinx: http://sphinx-doc.org -.. _Sphinx: http://sphinx-doc.org/ Running the Tests ----------------- @@ -70,14 +106,13 @@ Running the Tests You'll need pytest_ for that. It can be installed with:: - pip install -r tests/requirements.txt + pip install -r tests/requirements.txt To execute the tests, simply run:: - python -m pytest + python -m pytest -.. _pytest: - https://pytest.org/ +.. _pytest: https://pytest.org Creating a New Release @@ -87,5 +122,5 @@ New releases are made using the following steps: #. Update ``CHANGELOG.rst`` #. Commit those changes as "Release X.Y.Z" -#. Create an (annotated) tag with ``git tag -a vX.Y.Z`` +#. Create an (annotated) tag with ``git tag -a X.Y.Z`` #.
Push the commit and the tag to Github diff --git a/audpsychometric/__init__.py b/audpsychometric/__init__.py index 889c011..da18365 100644 --- a/audpsychometric/__init__.py +++ b/audpsychometric/__init__.py @@ -16,24 +16,19 @@ """ import audpsychometric.core -import audpsychometric.core.reliability from audpsychometric.core import datasets -from audpsychometric.core.gold_standard import ( - evaluator_weighted_estimator, - rater_confidence_pearson, - gold_standard_mean, - gold_standard_median, - gold_standard_mode, -) -from audpsychometric.core.reliability import ( - congeneric_reliability, - cronbachs_alpha, - intra_class_correlation -) - -from audpsychometric.core.datasets import ( - list_datasets, - read_dataset) +from audpsychometric.core.datasets import list_datasets +from audpsychometric.core.datasets import read_dataset +from audpsychometric.core.gold_standard import evaluator_weighted_estimator +from audpsychometric.core.gold_standard import gold_standard_mean +from audpsychometric.core.gold_standard import gold_standard_median +from audpsychometric.core.gold_standard import gold_standard_mode +from audpsychometric.core.gold_standard import rater_confidence_pearson +import audpsychometric.core.reliability +from audpsychometric.core.reliability import congeneric_reliability +from audpsychometric.core.reliability import cronbachs_alpha +from audpsychometric.core.reliability import intra_class_correlation + # Disencourage from audpsychometric import * __all__ = [] diff --git a/audpsychometric/core/__init__.py b/audpsychometric/core/__init__.py index af01988..e69de29 100644 --- a/audpsychometric/core/__init__.py +++ b/audpsychometric/core/__init__.py @@ -1,3 +0,0 @@ -""" -The core module -""" diff --git a/audpsychometric/core/datasets/__init__.py b/audpsychometric/core/datasets/__init__.py index f8afbcf..faa1be1 100644 --- a/audpsychometric/core/datasets/__init__.py +++ b/audpsychometric/core/datasets/__init__.py @@ -1,5 +1,4 @@ -"""Provide example datasets for package -""" +"""Provide example datasets for package.""" __all__ = ["read_dataset", "list_dataset"] @@ -8,13 +7,14 @@ import pandas as pd + data_directory = os.path.dirname(os.path.realpath(__file__)) -dataset_path = os.path.join(data_directory, 'datasets.csv') -data_sets = pd.read_csv(dataset_path, sep=',') +dataset_path = os.path.join(data_directory, "datasets.csv") +data_sets = pd.read_csv(dataset_path, sep=",") def read_dataset(data_set_name: str) -> pd.DataFrame: - r"""read dataset identified by name. + r"""Read dataset identified by name. retrieves a test dataset from within the package. @@ -27,24 +27,22 @@ def read_dataset(data_set_name: str) -> pd.DataFrame: """ - ds = data_sets.loc[data_sets["dataset"] == data_set_name] - fname = ds['fname'].values[0] + fname = ds["fname"].values[0] fpath = os.path.join(data_directory, fname) - df = pd.read_csv(fpath, sep=',') + df = pd.read_csv(fpath, sep=",") return df def list_datasets(): - r'''List tests datasets available in package + r"""List tests datasets available in package. 
Args: None Returns: table listing available datasets - ''' - - df_data_sets = data_sets.set_index('dataset') + """ + df_data_sets = data_sets.set_index("dataset") return df_data_sets diff --git a/audpsychometric/core/gold_standard.py b/audpsychometric/core/gold_standard.py index 20ea323..5d2e054 100644 --- a/audpsychometric/core/gold_standard.py +++ b/audpsychometric/core/gold_standard.py @@ -1,4 +1,4 @@ -"""Methods for calculating gold standards over individual raters' judgments""" +r"""Calculating gold standards over individual raters' judgments.""" import numpy as np @@ -8,11 +8,11 @@ def gold_standard_mean( - df: pd.DataFrame, - minimum: float, - maximum: float, - *, - axis: int = 1, + df: pd.DataFrame, + minimum: float, + maximum: float, + *, + axis: int = 1, ) -> pd.DataFrame: r"""Calculate the gold standard as the mean of raters' votes. @@ -39,16 +39,16 @@ def gold_standard_mean( ) gold_standard = df.mean(axis=axis) df_result = pd.concat([gold_standard, confidences], axis=1) - df_result.columns = ['gold_standard', 'confidence'] + df_result.columns = ["gold_standard", "confidence"] return df_result def gold_standard_median( - df: pd.DataFrame, - minimum: float, - maximum: float, - *, - axis: int = 1, + df: pd.DataFrame, + minimum: float, + maximum: float, + *, + axis: int = 1, ) -> pd.DataFrame: r"""Calculate the gold standard as the median of raters' votes. @@ -73,14 +73,14 @@ def gold_standard_median( ) gold_standard = df.median(axis=axis) df_result = pd.concat([gold_standard, confidences], axis=1) - df_result.columns = ['gold_standard', 'confidence'] + df_result.columns = ["gold_standard", "confidence"] return df_result def gold_standard_mode( - df: pd.DataFrame, - *, - axis: int = 1, + df: pd.DataFrame, + *, + axis: int = 1, ) -> pd.DataFrame: r"""Calculate the gold standard as the median of raters' votes. @@ -96,7 +96,6 @@ def gold_standard_mode( table containing `gold_standard` and `confidence` columns """ - gold_standard = np.floor(df.mode(axis=axis).mean(axis=axis) + 0.5) df["gold"] = gold_standard @@ -152,7 +151,7 @@ def evaluator_weighted_estimator( raters = df.columns.tolist() def ewe(row): - """functional to determine ewe per row""" + """Functional to determine ewe per row.""" total = sum([row[x] * confidences[x] for x in raters]) total /= np.sum([confidences[x] for x in raters]) return total @@ -182,7 +181,6 @@ def _confidence_categorical(row: pd.Series) -> float: categorical confidence score """ - columns = row.index.tolist() if "gold" not in columns: @@ -193,11 +191,11 @@ def _confidence_categorical(row: pd.Series) -> float: def _confidence_numerical( - row: pd.Series, - minimum: float, - maximum: float, + row: pd.Series, + minimum: float, + maximum: float, ) -> float: - """Functional to calculate confidence score row-wise - numerical. + r"""Functional to calculate confidence score row-wise - numerical. .. math:: confidence_\text{row} = max(0, 1 - std(row) / cutoff_max) @@ -216,13 +214,13 @@ def _confidence_numerical( """ raters = row.index.tolist() cutoff_max = maximum - 1 / 2 * (minimum + maximum) - return max([0., 1 - row[raters].std(ddof=0) / cutoff_max]) + return max([0.0, 1 - row[raters].std(ddof=0) / cutoff_max]) def rater_confidence_pearson( - df: pd.DataFrame, - *, - axis: int = 1, + df: pd.DataFrame, + *, + axis: int = 1, ) -> dict: """Calculate the rater confidence. 
@@ -247,7 +245,6 @@ def rater_confidence_pearson( dict with the rater confidences """ - if axis == 0: df = df.T diff --git a/audpsychometric/core/reliability.py b/audpsychometric/core/reliability.py index 3671ecb..4dbe466 100644 --- a/audpsychometric/core/reliability.py +++ b/audpsychometric/core/reliability.py @@ -1,5 +1,4 @@ -"""Psychometric Functions for Interrater Reliability. -""" +"""Psychometric Functions for Interrater Reliability.""" import typing @@ -58,9 +57,7 @@ def cronbachs_alpha(df: pd.DataFrame) -> typing.Tuple[float, typing.Dict]: total_variance = total_score.var(ddof=1) # var(X) alpha = n_items / (n_items - 1) * (1 - variance_sum / total_variance) - result = { - "total_variance": total_variance - } + result = {"total_variance": total_variance} return alpha, result @@ -96,9 +93,9 @@ def congeneric_reliability(df: pd.DataFrame) -> typing.Tuple[float, typing.Dict] return reliability, result -def intra_class_correlation(df: pd.DataFrame, - icc_type: str = "ICC_1_1", - anova_method: str = "pingouin") -> typing.Tuple[float, typing.Dict]: +def intra_class_correlation( + df: pd.DataFrame, icc_type: str = "ICC_1_1", anova_method: str = "pingouin" +) -> typing.Tuple[float, typing.Dict]: r"""Intraclass Correlation. Intraclass correlation calculates rating reliability by relating @@ -111,6 +108,8 @@ def intra_class_correlation(df: pd.DataFrame, Args: df: table in wide format with one rater per column icc_type: ICC Method, see description below + anova_method: method for ANOVA calculation, + can be ``"pingouin"`` or ``"statsmodels"`` Returns: icc and additional results lumped into dict @@ -279,14 +278,12 @@ def intra_class_correlation(df: pd.DataFrame, """ # noqa: E501 - def _anova(df_long: pd.DataFrame, - anova_method: str = "pingouin") -> pd.DataFrame: + def _anova(df_long: pd.DataFrame, anova_method: str = "pingouin") -> pd.DataFrame: """Helper to get the anova table. Note that pingouin is currently default as statsmodels. 
ols from the statsmodels package is slow under many circumstances """ - anova_methods = ["statsmodels", "pingouin"] if anova_method not in anova_methods: @@ -330,9 +327,9 @@ def _anova(df_long: pd.DataFrame, # convert back to wide format to see how many raters dropped: data_wide = data_long.pivot_table(index="item", columns="rater", values="rating") - # delete missings in listwise manner + # delete missing data in listwise manner nan_count = data_wide.isna().sum().sum() - print(f"we have {nan_count} missings") + print(f"We have {nan_count} missing data points") print("Deleting them!") if nan_count > 0: data_wide = data_wide.dropna(axis=0, how="any") @@ -378,15 +375,15 @@ def _anova(df_long: pd.DataFrame, icc_rater_type = 3 * ["single"] + 3 * ["average"] icc_rating_type = ["absolute", "consistency", "consistency"] * 2 icc_effect_type = ["-", "random", "fixed"] * 2 - ICC = [icc_1_1, icc_2_1, icc_3_1, icc_1_k, icc_2_k, icc_3_k] + icc = [icc_1_1, icc_2_1, icc_3_1, icc_1_k, icc_2_k, icc_3_k] results = pd.DataFrame( - [icc_types, icc_rater_type, icc_rating_type, icc_effect_type, ICC] + [icc_types, icc_rater_type, icc_rating_type, icc_effect_type, icc] ).T vars = ["icc_type", "rater type", "rating type", "anova effect type", "icc"] results_df = pd.DataFrame( - [icc_types, icc_rater_type, icc_rating_type, icc_effect_type, ICC], index=vars + [icc_types, icc_rater_type, icc_rating_type, icc_effect_type, icc], index=vars ).T results_table = results_df.to_dict("records") @@ -402,9 +399,6 @@ def _anova(df_long: pd.DataFrame, # "ICC": [icc_1_1, icc_2_1, icc_3_1, icc_1_k, icc_2_k, icc_3_k], # } - # idx = stats["Type"].index(icc_type) - # icc = stats["ICC"][idx] - icc_dict = [x for x in results_table if icc_type == x["icc_type"]] icc = icc_dict[0]["icc"] diff --git a/docs/conf.py b/docs/conf.py index 4942361..b258881 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,46 +1,43 @@ -import configparser -import os -import subprocess -from datetime import date +import toml import audeer -config = configparser.ConfigParser() -config.read(os.path.join('..', 'setup.cfg')) + +config = toml.load(audeer.path("..", "pyproject.toml")) # Project ----------------------------------------------------------------- -author = config['metadata']['author'] -copyright = f'2020-{date.today().year} audEERING GmbH' -project = config['metadata']['name'] +project = config["project"]["name"] +author = ", ".join(author["name"] for author in config["project"]["authors"]) +copyright = "audEERING GmbH" version = audeer.git_repo_version() -title = f'{project} Documentation' +title = "Documentation" # General ----------------------------------------------------------------- -master_doc = 'index' +master_doc = "index" extensions = [] -source_suffix = '.rst' -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] +source_suffix = ".rst" +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] pygments_style = None extensions = [ - 'jupyter_sphinx', # executing code blocks - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', # support for Google-style docstrings - 'sphinx.ext.viewcode', - 'sphinx.ext.intersphinx', - 'sphinx_autodoc_typehints', - 'sphinx_copybutton', # for "copy to clipboard" buttons - 'sphinxcontrib.bibtex' + "jupyter_sphinx", # executing code blocks + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", # support for Google-style docstrings + "sphinx.ext.viewcode", + "sphinx.ext.intersphinx", + "sphinx_autodoc_typehints", + "sphinx_copybutton", # for "copy to clipboard" buttons + 
"sphinxcontrib.bibtex", ] -bibtex_bibfiles = ['refs.bib'] -bibtex_default_style = 'alpha' +bibtex_bibfiles = ["refs.bib"] +bibtex_default_style = "alpha" # Ignore package dependencies during building the docs autodoc_mock_imports = [ - 'tqdm', + "tqdm", ] # Reference with :ref:`data-header:Database` @@ -48,32 +45,33 @@ autosectionlabel_maxdepth = 2 # Do not copy prompot output -copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_text = r">>> |\.\.\. " copybutton_prompt_is_regexp = True # Disable Gitlab as we need to sign in linkcheck_ignore = [ - 'https://gitlab.audeering.com', - r'.*evaluationdashboard.com/index.php/2012/09/22/*' + "https://gitlab.audeering.com", + r".*evaluationdashboard.com/index.php/2012/09/22/*", + "https://pypi.org/project/audpsychometric/", # no package published yet ] # HTML -------------------------------------------------------------------- -html_theme = 'sphinx_audeering_theme' +html_theme = "sphinx_audeering_theme" html_theme_options = { - 'display_version': True, - 'logo_only': False, + "display_version": True, + "logo_only": False, } html_title = title # -- Intersphinx ------------------------------------------------ intersphinx_mapping = { - 'audmetric': ('https://audeering.github.io/audmetric/', None), - 'matplotlib': ('http://matplotlib.org/', None), - 'numpy': ('http://docs.scipy.org/doc/numpy/', None), - 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), - 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None), - 'seaborn': ('https://seaborn.pydata.org/', None), - 'sklearn': ('http://scikit-learn.org/stable', None), + "audmetric": ("https://audeering.github.io/audmetric/", None), + "matplotlib": ("http://matplotlib.org/", None), + "numpy": ("http://docs.scipy.org/doc/numpy/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "scipy": ("http://docs.scipy.org/doc/scipy/reference/", None), + "seaborn": ("https://seaborn.pydata.org/", None), + "sklearn": ("http://scikit-learn.org/stable", None), } diff --git a/docs/index.rst b/docs/index.rst index 590af16..f3f5d58 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,3 @@ -.. documentation master file - .. include:: ../README.rst .. 
toctree:: diff --git a/docs/requirements.txt b/docs/requirements.txt index ec0af57..f21d291 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ +audeer ipykernel jupyter-sphinx sphinx @@ -6,3 +7,4 @@ sphinx-autodoc-typehints sphinx-copybutton sphinxcontrib-programoutput sphinxcontrib-bibtex +toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ef924f4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,212 @@ +# ===== PROJECT =========================================================== +# +[project] +name = 'audpsychometric' +authors = [ + {name = 'Hagen Wierstorf', email = 'hwierstorf@audeering.com'}, + {name = 'Christian Geng', email = 'cgeng@audeering.com'}, + {name = 'Sandrine Lefort'}, +] +description = 'Analyze and summarize human annotations' +readme = 'README.rst' +license = {file = 'LICENSE'} +keywords = [ + 'audio', + 'data', + 'dataset', + 'annotation', + 'mlops', + 'machine learning', +] +classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Science/Research', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Topic :: Scientific/Engineering', +] +dependencies = [ + 'audeer >=1.10.0', + 'audmetric', + 'pingouin', + 'numpy', + 'pandas', + 'scikit-learn', + 'statsmodels', +] +# Get version dynamically from git +# (needs setuptools_scm tools config below) +dynamic = ['version'] + +[project.urls] +repository = 'https://github.com/audeering/audpsychometric/' +documentation = 'https://audeering.github.io/audpsychometric/' + + +# ===== BUILD-SYSTEM ====================================================== +# +# Requirements for building the Python package +[build-system] +requires = ['setuptools>=45', 'setuptools_scm[toml]>=6.2'] +build-backend = 'setuptools.build_meta' + + +# ===== TOOL ============================================================== +# + +# ----- codespell --------------------------------------------------------- +[tool.codespell] +builtin = 'clear,rare,informal,usage,names' +skip = './audpsychometric.egg-info,./build,./docs/api,./docs/_templates,./docs/pics' +ignore-words-list = 'sie,Sie,unter' +uri-ignore-words-list = 'ist' + + +# ----- pytest ------------------------------------------------------------ +# +[tool.pytest.ini_options] +cache_dir = '.cache/pytest' +xfail_strict = true +addopts = ''' + --doctest-plus + --cov=audpsychometric + --cov-fail-under=100 + --cov-report term-missing + --cov-report xml + --ignore=docs/ + --ignore=benchmarks/ +''' + + +# ----- ruff -------------------------------------------------------------- +# +[tool.ruff] +cache-dir = '.cache/ruff' + +[tool.ruff.format] +docstring-code-format = true + +[tool.ruff.lint] +select = [ + 'D', # pydocstyle + 'E', # pycodestyle errors + 'F', # Pyflakes + 'I', # isort + 'N', # pep8-naming + 'W', # pycodestyle warnings +] + +extend-ignore = [ + 'D100', # Missing docstring in public module + 'D101', # Missing docstring in public class + 'D102', # Missing docstring in public method + 'D103', # Missing docstring in public function + 'D104', # Missing docstring in public package + 'D107', # Missing docstring in `__init__` +] + +[tool.ruff.lint.per-file-ignores] +'__init__.py' = [ + 'F401', # * imported but unused +] +'common.py' = [ + 'D105', # 
Missing docstring in magic method +] + + +# ----- I: isort ----- +# +# Check correct order/syntax of import statements +# +[tool.ruff.lint.isort] + +# All from imports have their own line, e.g. +# +# from .utils import util_a +# from .utils import util_b +# +force-single-line = true + +# Sort by module names +# and not import before from, e.g. +# +# from datetime import date +# import os +# +force-sort-within-sections = true + +# Ensure we have two empty lines +# after last import +lines-after-imports = 2 + +# Group all audEERING packages into a separate section, e.g. +# +# import os +# +# import numpy as np +# +# import audmath +# +section-order = [ + 'future', + 'standard-library', + 'third-party', + 'audeering', + 'first-party', + 'local-folder', +] +[tool.ruff.lint.isort.sections] +'audeering' = [ + 'audb', + 'audbackend', + 'audeer', + 'audformat', + 'audiofile', + 'audfactory', + 'audinterface', + 'audmath', + 'audmetric', + 'audobject', + 'audonnx', + 'audplot', + 'audresample', + 'audtorch', + 'opensmile', + 'sphinx-audeering-theme', +] + + +# ----- N: pep8-naming ----- +# +# Check variable/class names follow PEP8 naming convention +# +[tool.ruff.lint.pep8-naming] +ignore-names = [ + 'config', # allow lowercase class name + 'test_*', # allow uppercase name when testing a class +] + + +# ----- W: pycodestyle ----- +# +# Check docstrings follow selected convention +# +[tool.ruff.lint.pydocstyle] +convention = 'google' + + +# ----- setuptools -------------------------------------------------------- +# +# Find all (sub-)modules of the Python package +[tool.setuptools.packages.find] + +# ----- setuptools_scm ---------------------------------------------------- +# +# Use setuptools_scm to get version from git +[tool.setuptools_scm] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index cec5ba4..0000000 --- a/setup.cfg +++ /dev/null @@ -1,82 +0,0 @@ -[metadata] -name = audpsychometric -author = Christian Geng, Sandrine Lefort, Hagen Wierstorf -author-email = cgeng@audeering.com, slefort@audeering.com, hwierstorf@audeering.com -url = https://github.com/audeering/audpsychometric -project-urls = - Documentation = https://audeering.github.io/audpsychometric/ -description = A Python example package -long-description = file: README.rst, CHANGELOG.rst -license = MIT -license-file = LICENSE -keywords = example -platforms= any -classifiers = - Development Status :: 3 - Alpha - Intended Audience :: Developers - Intended Audience :: Science/Research - License :: audEERING - Operating System :: OS Independent - Programming Language :: Python - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Topic :: Scientific/Engineering - -[options] -packages = find: -install_requires = - audeer >=1.10.0 - audmetric - pingouin - numpy - pandas - scikit-learn - statsmodels -setup_requires = - setuptools_scm - -[tool:pytest] -addopts = - --doctest-plus - --cov=audpsychometric - --cov-fail-under=100 - --cov-report xml - --cov-report term-missing - --ignore=test_audeering_internal.py - -k "not internal" -xfail_strict = true -markers = - internal: tests that can only run audeering-internally. 
- -[flake8] -max-line-length = 119 -ignore = - W503 # math, https://github.com/PyCQA/pycodestyle/issues/513 - __init__.py F401 # ignore unused imports -per-file-ignores = - build/jupyter_execute/api.py:W391 - - -[yapf] -# or put into .style.yapf .style.yapf -based_on_style = pep8 -spaces_before_comment = 4 -split_before_logical_operator = true -column_limit = 119 -align_closing_bracket_with_visual_indent = True -split_before_arithmetic_operator = True -split_before_bitwise_operator = True -blank_line_before_module_docstring = True -split_before_dot = True -newline_after_paren = True - -[isort] -force_single_line=True -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -use_parentheses=True -line_length=119 -known_first_party=aud* -force_alphabetical_sort_within_sections=True -# isort docs at https://pycqa.github.io/isort/index.html diff --git a/setup.py b/setup.py deleted file mode 100644 index d5d43d7..0000000 --- a/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -from setuptools import setup - -setup(use_scm_version=True) diff --git a/tests/conftest.py b/tests/conftest.py index 91227b0..b83ce14 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ -import pytest import numpy as np +import pytest + from audpsychometric import datasets diff --git a/tests/test_audeering_internal.py b/tests/test_audeering_internal.py deleted file mode 100644 index adfe63d..0000000 --- a/tests/test_audeering_internal.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Tests that can only be run audeering-internally - -These are marked by a pytest mark and can be selected with. - - python -m pytest -k "internal" - -""" - -import os - -import numpy as np -import pandas as pd -import pingouin as pg -import pytest - -import audmetric -import audpsychometric - -pd.options.display.width = 0 -N_ANSWERS_THRESH = 25 -pytestmark = pytest.mark.internal - - -def generate_coreset_df(coreset_name="set133"): - """Create the coreset df using audb - - This coreset is prior to any mappint to n classes - - Args: - coreset_name: The corset nets to be known - - Returns: - table containing the ratings in wide format - - """ - import audb - - database_name = "projectsmile-salamander-agent-tone" - db = audb.load(database_name, only_metadata=True) - - df = pd.concat( - [ - db["agent-tone.train"].get().astype("float"), - db["agent-tone.dev"].get().astype("float"), - db["agent-tone.test"].get().astype("float"), - db["agent-tone.test2"].get().astype("float"), - ] - ) - - df_sets = db["sets"].get() - df_core_set = df_sets[df_sets.set == coreset_name] - - print(f"df shapes {df.shape}") - print(f"Core set shape {df_core_set.shape}") - - def f(x): - return x in list(df_core_set.index) # noqa: E731 - - select = pd.Series(df.index.get_level_values("file").values).apply(f) - df = df[(select.values)] - print(df.shape) - - n_raters = len(df.columns) - nobs = n_raters - df.isnull().sum(axis=1) - print(f"Determine total number of raters: {n_raters}") - print("Distribution of number of ratings per Item:") - print(nobs.value_counts()) - df_reliability = df[nobs > N_ANSWERS_THRESH] - - return df_reliability - - -@pytest.fixture(scope="module", autouse=True) -def coreset_df() -> pd.DataFrame: - """ - - Args: - None - Returns: - pd.DataFrame - - generate coreset dataframe for the projectsmile salamander agent tone - - Initially, two coresets were considered candidates, set142 - and set133. Set 133 proved the right one. 
- - Unlike the Salamander_Feedback_Agent.ipynb notebook, - these parts of the data are not considered because these - overlap with the ones that are only called dev and test: - - - df_unbalanced_test - - df_unbalanced_dev - - Instead, the data in agent-tone.test2 are added - - The dataset is returned as is and not reshaped to wide. - - """ - - dataset_name = "coreset_133" - dataset_path = os.path.join(audpsychometric.datasets.data_directory, f"{dataset_name}.csv") - - if os.path.exists(dataset_path): - print("reading from disk") - df = pd.read_csv(dataset_path) - else: - print("generating") - df = generate_coreset_df() - df.to_csv(dataset_path, index=False) - - return df - - -def test_audeering_icc(coreset_df): - """Test the coreset results""" - # impute Flag - impute = False - if impute: - # f = lambda x: x.fillna(x.mean()) - # f = lambda x: x.fillna(x.mode()) - # coreset_df = coreset_df.apply(f, axis=1) - coreset_df = coreset_df.apply(lambda x: x.fillna(x.mean()), axis=1) - # coreset_df = coreset_df.apply(lambda x: x.fillna(x.mode()),axis=1) - - n_nan = coreset_df.isna().sum().sum() - print(f"currently {n_nan} nan values") - n_raters_tot = coreset_df.shape[1] - - # convert to LONG format: - df_long = coreset_df.melt(ignore_index=False) - df_long["item"] = df_long.index - df_long.columns = ["rater", "rating", "item"] - n_raters_tot = coreset_df.shape[1] - - # convert back to wide format: - data = df_long.pivot_table(index="item", columns="rater", values="rating") - - n_raters_trimmed = data.shape[1] - print(f"N Raters before/after trimming: {n_raters_tot}/ {n_raters_trimmed}") - - icc_p = pg.intraclass_corr( - data=df_long, - targets="item", - raters="rater", - ratings="rating", - nan_policy="omit", - ) - - print(icc_p) - icc_1 = icc_p.loc[icc_p["Type"] == "ICC1"].iloc[0] - - assert np.isclose(icc_1["ICC"], 0.2180, atol=1e-4) - assert np.isclose(icc_1["F"], 8.8059, atol=1e-4) - - # Expected Results: - # icc_p - # Type Description ICC F df1 df2 pval CI95% - # 0 ICC1 Single raters absolute 0.218007 8.805960 183 4968 3.238633e-190 [0.18, 0.26] - # 1 ICC2 Single random raters 0.226355 14.350342 183 4941 0.000000e+00 [0.18, 0.28] - # 2 ICC3 Single fixed raters 0.322859 14.350342 183 4941 0.000000e+00 [0.28, 0.38] - # 3 ICC1k Average raters absolute 0.886441 8.805960 183 4968 3.238633e-190 [0.86, 0.91] - # 4 ICC2k Average random raters 0.891214 14.350342 183 4941 0.000000e+00 [0.86, 0.92] - # 5 ICC3k Average fixed raters 0.930315 14.350342 183 4941 0.000000e+00 [0.91, 0.94] - - icc_aa, results = audpsychometric.intra_class_correlation(coreset_df) - # first value test - assert np.isclose(icc_aa, 0.2180, atol=1e-4) - assert len(results) == 3 - # return the right keys - assert all( - [ - a == b - for a, b in zip( - results.keys(), ["icc_dict", "results_table", "anova_table"] - ) - ] - ) - - -def test_audeering_goldstandard_mean(coreset_df): - r"""coreset: happy flow for gold standard mean""" - df = audpsychometric.gold_standard_mean(coreset_df) - assert isinstance(df, pd.DataFrame) - assert "gold_standard" in df.columns - assert "confidence" in df.columns - assert np.alltrue((df["confidence"] >= 0.0) & (df["confidence"] <= 1.0).values) - - -def test_audeering_goldstandard_median(coreset_df): - r"""coreset: happy flow for gold standard median""" - df = audpsychometric.gold_standard_median(coreset_df) - assert isinstance(df, pd.DataFrame) - assert "gold_standard" in df.columns - assert "confidence" in df.columns - assert np.alltrue((df["confidence"] >= 0.0) & (df["confidence"] <= 1.0).values) - - 
-def test_audeering_goldstandard_mode(coreset_df): - r"""coreset: happy flow for mode""" - df = audpsychometric.gold_standard_mode(coreset_df) - assert isinstance(df, pd.DataFrame) - assert "gold_standard" in df.columns - assert "confidence" in df.columns - assert np.alltrue((df["confidence"] >= 0.0) & (df["confidence"] <= 1.0).values) - - -def test_confidence_values(coreset_df): - r"""check that confidences correlate""" - - df_ewe = audpsychometric.evaluator_weighted_estimator(coreset_df) - df_mode = audpsychometric.gold_standard_mode(coreset_df) - - df_corr = pd.concat([df_mode["confidence"], df_ewe["confidence"]], axis=1) - corr = audmetric.pearson_cc(df_mode["confidence"], df_ewe["confidence"]) - assert corr > 0.5 - assert df_corr.corr().iloc[0, 1] > 0.5 diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 73bcfa1..744a51a 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,4 +1,4 @@ -"""Test Dataset Module""" +"""Test Dataset Module.""" import pandas as pd import pytest @@ -6,21 +6,23 @@ def test_list_datasets(): - """first basic dataset is available in dataset list""" + """First basic dataset is available in dataset list.""" df_datasets = audpsychometric.datasets.list_datasets() - assert 'statology' in df_datasets.index + assert "statology" in df_datasets.index -@pytest.mark.parametrize('dataset', [ - 'statology', - 'hallgren-table5', - 'hallgren-table3', - 'HolzingerSwineford1939', - 'Shrout_Fleiss', - 'wine' -]) +@pytest.mark.parametrize( + "dataset", + [ + "statology", + "hallgren-table5", + "hallgren-table3", + "HolzingerSwineford1939", + "Shrout_Fleiss", + "wine", + ], +) def test_read_dataset(dataset): - """test functional requirement that a dataset can be read into dataframe""" - + """Test functional requirement that a dataset can be read into dataframe.""" df_dataset = audpsychometric.datasets.read_dataset(dataset) assert isinstance(df_dataset, pd.DataFrame) diff --git a/tests/test_gold_standard.py b/tests/test_gold_standard.py index b425ef5..0714bcf 100644 --- a/tests/test_gold_standard.py +++ b/tests/test_gold_standard.py @@ -1,7 +1,6 @@ -"""Tests of the module calculating gold standard and item confidence - -Usage Example(s): +"""Tests of the module calculating gold standard and item confidence. +Examples: pytest tests/test_goldstandard.py -k test_evaluator_weighted_estimator """ @@ -17,7 +16,7 @@ def test_rater_confidence_pearson(df_holzinger_swineford): - """Happy Flow test for mode for rater based consistency""" + """Happy Flow test for mode for rater based consistency.""" result = audpsychometric.rater_confidence_pearson(df_holzinger_swineford) result_values = np.array([x for x in result.values()]) # there is a very unrealible rater in this set with .24 @@ -25,14 +24,13 @@ def test_rater_confidence_pearson(df_holzinger_swineford): def test_mode_based_gold_standard(): - """Happy Flow test for mode based gold standard""" - + """Happy Flow test for mode based gold standard.""" df = pd.DataFrame([[4, 9, np.nan]] * 3, columns=["A", "B", "C"]) df = audpsychometric.gold_standard_mode(df) assert isinstance(df, pd.DataFrame) assert "gold_standard" in df.columns assert "confidence" in df.columns - assert np.all((df['confidence'] >= 0.) 
& (df['confidence'] <= 1.).values) + assert np.all((df["confidence"] >= 0.0) & (df["confidence"] <= 1.0).values) # The expected confidence value for this test @@ -45,7 +43,7 @@ def test_mode_based_gold_standard(): # return 1 - std_norm # @pytest.mark.parametrize( - 'df, minimum, maximum, axis, df_expected', + "df, minimum, maximum, axis, df_expected", [ ( pd.DataFrame([0]), @@ -53,8 +51,8 @@ def test_mode_based_gold_standard(): 1, 1, pd.DataFrame( - [[0., 1.]], - columns=['gold_standard', 'confidence'], + [[0.0, 1.0]], + columns=["gold_standard", "confidence"], ), ), ( @@ -63,8 +61,8 @@ def test_mode_based_gold_standard(): 1, 1, pd.DataFrame( - [[0., 1.]], - columns=['gold_standard', 'confidence'], + [[0.0, 1.0]], + columns=["gold_standard", "confidence"], ), ), ( @@ -73,68 +71,68 @@ def test_mode_based_gold_standard(): 1, 1, pd.DataFrame( - [[1., 1.]], - columns=['gold_standard', 'confidence'], + [[1.0, 1.0]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[.3, .3, .3]]), + pd.DataFrame([[0.3, 0.3, 0.3]]), 0, 1, 1, pd.DataFrame( - [[.3, 1.]], - columns=['gold_standard', 'confidence'], + [[0.3, 1.0]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, 0, .1, .2]]), + pd.DataFrame([[0, 0, 0.1, 0.2]]), 0, 1, 1, pd.DataFrame( - [[.075, 0.83416876048223]], - columns=['gold_standard', 'confidence'], + [[0.075, 0.83416876048223]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, 0, .2, .4]]), + pd.DataFrame([[0, 0, 0.2, 0.4]]), 0, 1, 1, pd.DataFrame( - [[.150, .66833752096446]], - columns=['gold_standard', 'confidence'], + [[0.150, 0.66833752096446]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, 0, 0, 0, .2, .2, .4, .4]]), + pd.DataFrame([[0, 0, 0, 0, 0.2, 0.2, 0.4, 0.4]]), 0, 1, 1, pd.DataFrame( - [[.150, .66833752096446]], - columns=['gold_standard', 'confidence'], + [[0.150, 0.66833752096446]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, .4, .6, 1]]), + pd.DataFrame([[0, 0.4, 0.6, 1]]), 0, 1, 1, pd.DataFrame( - [[.5, .2788897449072021]], - columns=['gold_standard', 'confidence'], + [[0.5, 0.2788897449072021]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, .33, .67, 1]]), + pd.DataFrame([[0, 0.33, 0.67, 1]]), 0, 1, 1, pd.DataFrame( - [[.5, .2531399060064863]], - columns=['gold_standard', 'confidence'], + [[0.5, 0.2531399060064863]], + columns=["gold_standard", "confidence"], ), ), ( @@ -143,8 +141,8 @@ def test_mode_based_gold_standard(): 1, 1, pd.DataFrame( - [[.5, 0.]], - columns=['gold_standard', 'confidence'], + [[0.5, 0.0]], + columns=["gold_standard", "confidence"], ), ), ( @@ -153,8 +151,8 @@ def test_mode_based_gold_standard(): 1, 1, pd.DataFrame( - [[.5, 0.]], - columns=['gold_standard', 'confidence'], + [[0.5, 0.0]], + columns=["gold_standard", "confidence"], ), ), ( @@ -169,11 +167,11 @@ def test_mode_based_gold_standard(): 0, pd.DataFrame( [ - [2., .8], - [3., .8], - [4., .8], + [2.0, 0.8], + [3.0, 0.8], + [4.0, 0.8], ], - columns=['gold_standard', 'confidence'], + columns=["gold_standard", "confidence"], ), ), ( @@ -188,16 +186,16 @@ def test_mode_based_gold_standard(): 1, pd.DataFrame( [ - [2., .8367006838144548], - [4., .8367006838144548], + [2.0, 0.8367006838144548], + [4.0, 0.8367006838144548], ], - columns=['gold_standard', 'confidence'], + columns=["gold_standard", "confidence"], ), ), - ] + ], ) def test_mean_based_gold_standard(df, minimum, maximum, axis, df_expected): - """Happy Flow test for mode based gold standard""" + """Happy 
Flow test for mode based gold standard.""" pd.testing.assert_frame_equal( audpsychometric.gold_standard_mean(df, minimum, maximum, axis=axis), df_expected, @@ -214,7 +212,7 @@ def test_mean_based_gold_standard(df, minimum, maximum, axis, df_expected): # return 1 - std_norm # @pytest.mark.parametrize( - 'df, minimum, maximum, axis, df_expected', + "df, minimum, maximum, axis, df_expected", [ ( pd.DataFrame([0]), @@ -222,8 +220,8 @@ def test_mean_based_gold_standard(df, minimum, maximum, axis, df_expected): 1, 1, pd.DataFrame( - [[0., 1.]], - columns=['gold_standard', 'confidence'], + [[0.0, 1.0]], + columns=["gold_standard", "confidence"], ), ), ( @@ -232,8 +230,8 @@ def test_mean_based_gold_standard(df, minimum, maximum, axis, df_expected): 1, 1, pd.DataFrame( - [[0., 1.]], - columns=['gold_standard', 'confidence'], + [[0.0, 1.0]], + columns=["gold_standard", "confidence"], ), ), ( @@ -242,68 +240,68 @@ def test_mean_based_gold_standard(df, minimum, maximum, axis, df_expected): 1, 1, pd.DataFrame( - [[1., 1.]], - columns=['gold_standard', 'confidence'], + [[1.0, 1.0]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[.3, .3, .3]]), + pd.DataFrame([[0.3, 0.3, 0.3]]), 0, 1, 1, pd.DataFrame( - [[.3, 1.]], - columns=['gold_standard', 'confidence'], + [[0.3, 1.0]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, 0, .1, .2]]), + pd.DataFrame([[0, 0, 0.1, 0.2]]), 0, 1, 1, pd.DataFrame( - [[.05, 0.83416876048223]], - columns=['gold_standard', 'confidence'], + [[0.05, 0.83416876048223]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, 0, .2, .4]]), + pd.DataFrame([[0, 0, 0.2, 0.4]]), 0, 1, 1, pd.DataFrame( - [[.1, .66833752096446]], - columns=['gold_standard', 'confidence'], + [[0.1, 0.66833752096446]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, 0, 0, 0, .2, .2, .4, .4]]), + pd.DataFrame([[0, 0, 0, 0, 0.2, 0.2, 0.4, 0.4]]), 0, 1, 1, pd.DataFrame( - [[.1, .66833752096446]], - columns=['gold_standard', 'confidence'], + [[0.1, 0.66833752096446]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, .4, .6, 1]]), + pd.DataFrame([[0, 0.4, 0.6, 1]]), 0, 1, 1, pd.DataFrame( - [[.5, .2788897449072021]], - columns=['gold_standard', 'confidence'], + [[0.5, 0.2788897449072021]], + columns=["gold_standard", "confidence"], ), ), ( - pd.DataFrame([[0, .33, .67, 1]]), + pd.DataFrame([[0, 0.33, 0.67, 1]]), 0, 1, 1, pd.DataFrame( - [[.5, .2531399060064863]], - columns=['gold_standard', 'confidence'], + [[0.5, 0.2531399060064863]], + columns=["gold_standard", "confidence"], ), ), ( @@ -312,8 +310,8 @@ def test_mean_based_gold_standard(df, minimum, maximum, axis, df_expected): 1, 1, pd.DataFrame( - [[.5, 0.]], - columns=['gold_standard', 'confidence'], + [[0.5, 0.0]], + columns=["gold_standard", "confidence"], ), ), ( @@ -322,8 +320,8 @@ def test_mean_based_gold_standard(df, minimum, maximum, axis, df_expected): 1, 1, pd.DataFrame( - [[.5, 0.]], - columns=['gold_standard', 'confidence'], + [[0.5, 0.0]], + columns=["gold_standard", "confidence"], ), ), ( @@ -338,11 +336,11 @@ def test_mean_based_gold_standard(df, minimum, maximum, axis, df_expected): 0, pd.DataFrame( [ - [2., .8], - [3., .8], - [4., .8], + [2.0, 0.8], + [3.0, 0.8], + [4.0, 0.8], ], - columns=['gold_standard', 'confidence'], + columns=["gold_standard", "confidence"], ), ), ( @@ -357,25 +355,25 @@ def test_mean_based_gold_standard(df, minimum, maximum, axis, df_expected): 1, pd.DataFrame( [ - [2., .8367006838144548], - [4., .8367006838144548], 
+ [2.0, 0.8367006838144548], + [4.0, 0.8367006838144548], ], - columns=['gold_standard', 'confidence'], + columns=["gold_standard", "confidence"], ), ), - ] + ], ) def test_median_based_gold_standard(df, minimum, maximum, axis, df_expected): - """Test that median gold standard returns df""" + """Test that median gold standard returns df.""" pd.testing.assert_frame_equal( audpsychometric.gold_standard_median(df, minimum, maximum, axis=axis), df_expected, ) -@pytest.mark.parametrize('axis', [0, 1]) +@pytest.mark.parametrize("axis", [0, 1]) def test_evaluator_weighted_estimator(df_holzinger_swineford, axis): - """Happy Flow test for mode based gold standard""" + """Happy Flow test for mode based gold standard.""" if axis == 0: df_holzinger_swineford = df_holzinger_swineford.T @@ -409,5 +407,5 @@ def test_evaluator_weighted_estimator(df_holzinger_swineford, axis): @pytest.mark.xfail(raises=ValueError) def test_f_categorical(df_holzinger_swineford): - """test that functional raises when no gold column""" + """Test that functional raises when no gold column.""" _ = _confidence_categorical(df_holzinger_swineford) diff --git a/tests/test_reliability.py b/tests/test_reliability.py index df53ed5..e3d075d 100644 --- a/tests/test_reliability.py +++ b/tests/test_reliability.py @@ -6,22 +6,21 @@ def test_icc(): - """test icc basic result validity""" + """Test icc basic result validity.""" df_dataset = audpsychometric.datasets.read_dataset("wine") - data_wide = df_dataset.pivot_table(index="Wine", - columns="Judge", - values="Scores") + data_wide = df_dataset.pivot_table(index="Wine", columns="Judge", values="Scores") - icc_sm, _ = audpsychometric.intra_class_correlation(data_wide, anova_method='statsmodels') + icc_sm, _ = audpsychometric.intra_class_correlation( + data_wide, anova_method="statsmodels" + ) icc_pingouin, _ = audpsychometric.intra_class_correlation(data_wide) assert np.isclose(icc_pingouin, 0.727, atol=1e-3) assert np.isclose(icc_sm, icc_pingouin, atol=1e-10) def test_cronbachs_alpha(): - """test cronbach's alpha return values for three raters.""" - + """Test cronbach's alpha return values for three raters.""" df_dataset = audpsychometric.datasets.read_dataset("hallgren-table3") df = df_dataset[["Dep_Rater1", "Dep_Rater2", "Dep_Rater3"]] alpha, result = audpsychometric.cronbachs_alpha(df) @@ -30,22 +29,21 @@ def test_cronbachs_alpha(): def test_congeneric_reliability(df_holzinger_swineford): - """test congeneric reliability""" - + """Test congeneric reliability.""" coefficient, result = audpsychometric.congeneric_reliability(df_holzinger_swineford) assert np.isclose(coefficient, 0.9365, atol=1e-4) - assert np.isclose(result['var. explained'][0], 0.3713, atol=1e-4) + assert np.isclose(result["var. 
explained"][0], 0.3713, atol=1e-4) @pytest.mark.xfail(raises=ValueError) def test_anova_helper(): - """test that unknow anova parametrization raises exception""" - audpsychometric.intra_class_correlation(pd.DataFrame(), anova_method='bbbb') + """Test that unknown anova parametrization raises exception.""" + audpsychometric.intra_class_correlation(pd.DataFrame(), anova_method="bbbb") def test_icc_nanremoval(): - """cover nan removal if statement""" + """Cover nan removal if statement.""" df_dataset = audpsychometric.datasets.read_dataset("HolzingerSwineford1939") - df_dataset = df_dataset[[x for x in df_dataset.columns if x.startswith('x')]] + df_dataset = df_dataset[[x for x in df_dataset.columns if x.startswith("x")]] nan_mat = np.random.random(df_dataset.shape) < 0.1 audpsychometric.intra_class_correlation(df_dataset.mask(nan_mat))