From d4e78098f040bbcdb42bde7b3e586b4b908d8667 Mon Sep 17 00:00:00 2001 From: FBurkhardt Date: Thu, 25 Jan 2024 14:36:07 +0100 Subject: [PATCH 01/10] initial --- 1.3.0/create.py | 44 ++++++++++++++++++++++++++++++++++++++++++ 1.3.0/publish.py | 19 ++++++++++++++++++ 1.3.0/requirements.txt | 8 ++++++++ 3 files changed, 71 insertions(+) create mode 100644 1.3.0/create.py create mode 100644 1.3.0/publish.py create mode 100644 1.3.0/requirements.txt diff --git a/1.3.0/create.py b/1.3.0/create.py new file mode 100644 index 0000000..1d538b1 --- /dev/null +++ b/1.3.0/create.py @@ -0,0 +1,44 @@ +""" +create.py + +add new age tables to the databases + +* per speaker use 10-20 random sentences of about 2-10 seconds +* split into train test dev, should be age/gender balanced +* make two versions: one with emotion acted samples and one with neutral +* so all in all 6 new tables + +""" + +import os +import pandas as pd +import audb +import audeer +import audformat + + +def main(): + name = "crema-d" + previous_version = "1.2.0" + + build_dir = "../build" + build_dir = audeer.mkdir(build_dir) + + audb.load_to( + build_dir, + name, + version=previous_version, + num_workers=8, + only_metadata=True, + verbose=True, + ) + db = audformat.Database.load(build_dir) + + splits = ["train", "dev", "test"] + for split in splits: + pass + db.save(build_dir) + + +if __name__ == "__main__": + main() diff --git a/1.3.0/publish.py b/1.3.0/publish.py new file mode 100644 index 0000000..ecc8b9f --- /dev/null +++ b/1.3.0/publish.py @@ -0,0 +1,19 @@ +import audb + +previous_version = "1.2.0" +version = "1.3.0" +build_dir = "./build" + +repository = audb.Repository( + name="data-public-local", + host="https://artifactory.audeering.com/artifactory", + backend="artifactory", +) +audb.publish( + build_dir, + version=version, + previous_version=previous_version, + repository=repository, + num_workers=1, + verbose=True, +) diff --git a/1.3.0/requirements.txt b/1.3.0/requirements.txt new file mode 100644 index 0000000..69203a9 --- /dev/null +++ b/1.3.0/requirements.txt @@ -0,0 +1,8 @@ +pandas +matplotlib +seaborn +jupyter +audb +audeer +audformat + From 58667d764f7b6be3f20136392287f767ca90d0f1 Mon Sep 17 00:00:00 2001 From: FBurkhardt Date: Tue, 19 Mar 2024 14:07:00 +0100 Subject: [PATCH 02/10] update --- 1.3.0/create.py | 69 +++-- 1.3.0/publish.py | 11 +- 1.3.0/split_utils.py | 514 +++++++++++++++++++++++++++++++++++++ 1.3.0/trainDevTestSplit.py | 52 ++++ 1.3.0/util.py | 63 +++++ 5 files changed, 689 insertions(+), 20 deletions(-) create mode 100644 1.3.0/split_utils.py create mode 100644 1.3.0/trainDevTestSplit.py create mode 100644 1.3.0/util.py diff --git a/1.3.0/create.py b/1.3.0/create.py index 1d538b1..63313ce 100644 --- a/1.3.0/create.py +++ b/1.3.0/create.py @@ -1,20 +1,18 @@ -""" -create.py - -add new age tables to the databases - -* per speaker use 10-20 random sentences of about 2-10 seconds -* split into train test dev, should be age/gender balanced -* make two versions: one with emotion acted samples and one with neutral -* so all in all 6 new tables - -""" - import os +import random import pandas as pd + import audb import audeer import audformat +import audiofile +import util +import trainDevTestSplit + +# make it reproducible +random.seed(23) + +image_dir = "images/" def main(): @@ -34,9 +32,50 @@ def main(): ) db = audformat.Database.load(build_dir) - splits = ["train", "dev", "test"] - for split in splits: - pass + # get age, gender and emotion info + df = db["files"].get() + df_emo = pd.concat( + [ + 
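+            # collect the emotion labels of the three original split tables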
db["emotion.categories.train"].get(), + db["emotion.categories.dev"].get(), + db["emotion.categories.test"].get(), + ] + ) + df["age"] = db["files"]["speaker"].get(map="age").astype("int") + df["gender"] = db["files"]["speaker"].get(map="sex") + # df["duration"] = df.index.to_series().map(lambda x: audiofile.duration(x)) + df = df[df["gender"] != "other"] + df = df[df["corrupted"] != True] + audeer.mkdir(image_dir) + util.describe_df(df, f"{image_dir}all.png") + df["emotion"] = df_emo["emotion.0"].values + df = df[df.emotion.isin(["neutral"])] + util.describe_df(df, f"{image_dir}all_neutral.png") + df = util.limit_speakers(df) + util.describe_df(df, f"{image_dir}all_limited.png") + + # create split sets + splits = {} + df_train, df_dev, df_test = trainDevTestSplit.split_df(df) + splits["train"] = df_train + splits["dev"] = df_dev + splits["test"] = df_test + # plot distributions + for split in ["train", "dev", "test"]: + print(f"split: {split}") + util.describe_df(splits[split], f"{image_dir}{split}.png") + + # fill the database with new tables + age_tables_name = "age." + for split in splits.keys(): + db[f"{age_tables_name}{split}"] = audformat.Table( + splits[split].index, + description=f"Table selected for age and binary gender balance from the emotionally neutral samples, max 20 samples per speaker.", + ) + for field in ["speaker"]: + db[f"{age_tables_name}{split}"][field] = audformat.Column(scheme_id=field) + db[f"{age_tables_name}{split}"][field].set(splits[split][field]) + db.save(build_dir) diff --git a/1.3.0/publish.py b/1.3.0/publish.py index ecc8b9f..c585f30 100644 --- a/1.3.0/publish.py +++ b/1.3.0/publish.py @@ -1,14 +1,15 @@ import audb -previous_version = "1.2.0" -version = "1.3.0" -build_dir = "./build" +previous_version = '1.2.0' +version = '1.3.0' +build_dir = '../build' repository = audb.Repository( - name="data-public-local", - host="https://artifactory.audeering.com/artifactory", + name="data-public", + host="https://audeering.jfrog.io/artifactory", backend="artifactory", ) + audb.publish( build_dir, version=version, diff --git a/1.3.0/split_utils.py b/1.3.0/split_utils.py new file mode 100644 index 0000000..86f582a --- /dev/null +++ b/1.3.0/split_utils.py @@ -0,0 +1,514 @@ +from collections import Counter +import numpy as np +import pandas as pd +import scipy.spatial as ssp +from sklearn.model_selection import GroupShuffleSplit +import sys + + +def optimize_traindevtest_split(X, y, split_on, stratify_on, weight=None, dev_size=.1, + test_size=.1, k=30, seed=42): + + ''' optimize group-disjunct split into training, dev, and test set, which is guided by: + - disjunct split of values in SPLIT_ON + - stratification by all keys in STRATIFY_ON (targets and groupings) + - test set proportion in X should be close to test_size (which is the test + proportion in set(split_on)) + + Score to be minimized: (sum_v[w(v) * max_irad(v)] + w(d) * max_d) / (sum_v[w(v)] + w(d)) + (v: variables to be stratified on + w(v): their weight + max_irad(v): maximum information radius of reference distribution of classes in v and + - dev set distribution, + - test set distribution + N(v): number of stratification variables + max_d: maximum of absolute difference between dev and test sizes of X and set(split_on) + w(d): its weight + + Args: + X: (pd.DataFrame) of features/groupings for which best split + is to be calculated. 
Of shape (N, M) + y: (np.array) of targets of length N + if type(y[0]) in ["str", "int"]: y is assumed to be categorical, so that it is additionally + tested that all partitions cover all classes. Else y is assumed to be numeric and no + coverage test is done. + split_on: (np.array) list of length N with grouping variable (e.g. speaker IDs), + on which the group-disjunct split is to be performed. Must be categorical. + stratify_on: (dict) Dict-keys are variable names (targets and/or further groupings) + the split should be stratified on (groupings could e.g. be sex, age class, etc). + Dict-Values are np.array-s of length N that contain the variable values. All + variables must be categorical. + weight: (dict) weight for each variable in stratify_on. Defines their amount of + contribution to the optimization score. Uniform weighting by default. Additional + key: "size_diff" defines how the corresponding size differences should be weighted. + dev_size: (float) proportion in set(split_on) for dev set, e.g. 10% of speakers + to be held-out + test_size: (float) test proportion in set(split_on) for test set + k: (int) number of different splits to be tried out + seed: (int) random seed + Returns: + train_i: (np.array) train set indices in X + dev_i: (np.array) dev set indices in X + test_i: (np.array) test set indices in X + info: (dict) detail information about reference and achieved prob distributions + "dev_size_in_spliton": intended grouping dev_size + "dev_size_in_X": optimized dev proportion of observations in X + "test_size_in_spliton": intended grouping test_size + "test_size_in_X": optimized test proportion of observations in X + "p_ref_{c}": reference class distribution calculated from stratify_on[c] + "p_dev_{c}": dev set class distribution calculated from stratify_on[c][dev_i] + "p_test_{c}": test set class distribution calculated from stratify_on[c][test_i] + ''' + + # data size + N = len(y) + + # categorical target: number of classes for coverage test + if is_categorical(y[0]): + nc = len(set(y)) + else: + nc = None + + # adjusted dev_size after having split off the test set + dev_size_adj = (dev_size * N) / (N - test_size * N) + + # split all into train/dev vs test + gss_o = GroupShuffleSplit(n_splits=k, test_size=test_size, + random_state=seed) + + # split train/dev into train vs dev + gss_i = GroupShuffleSplit(n_splits=k, test_size=dev_size_adj, + random_state=seed) + + # set weight defaults + if weight is None: + weight = {} + for c in stratify_on.keys(): + if c not in weight: + weight[c] = 1 + if "size_diff" not in weight: + weight["size_diff"] = 1 + + # stratification reference distributions calculated on stratify_on + p_ref = {} + for c in stratify_on: + p_ref[c] = class_prob(stratify_on[c]) + + # best train/dev/test indices in X; best associated score + train_i, dev_i, test_i, best_sco = None, None, None, np.inf + + # full target coverage in all partitions + full_target_coverage = False + + # brute-force optimization of SPLIT_ON split + # outer loop *_o: splitting into train/dev and test + # inner loop *_i: spltting into train and dev + for tri_o, tei_o in gss_o.split(X, y, split_on): + + # current train/dev partition + X_i = X.iloc[tri_o] + y_i = y[tri_o] + split_on_i = split_on[tri_o] + + for tri_i, tei_i in gss_i.split(X_i, y_i, split_on_i): + + # all classes maintained in all partitions? 
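+            # (nc is None for a numeric target, in which case the
+            # class-coverage check below is skipped)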
+ if nc: + nc_train = len(set(y[tri_o[tri_i]])) + nc_dev = len(set(y[tri_o[tei_i]])) + nc_test = len(set(y[tei_o])) + if min(nc_train, nc_dev, nc_test) < nc: + continue + + full_target_coverage = True + + sco = calc_split_score(test_i=tei_o, + stratify_on=stratify_on, + weight=weight, p_ref=p_ref, + N=N, test_size=test_size, + dev_i=tri_o[tei_i], + dev_size=dev_size_adj) + + if sco < best_sco: + best_sco = sco + test_i = tei_o + train_i = tri_o[tri_i] + dev_i = tri_o[tei_i] + + if test_i is None: + sys.exit(exit_message(full_target_coverage, "dev and test")) + + # matching info + info = {"score": best_sco, + "size_devset_in_spliton": dev_size, + "size_devset_in_X": np.round(len(dev_i) / N, 2), + "size_testset_in_spliton": test_size, + "size_testset_in_X": np.round(len(test_i) / N, 2)} + + for c in p_ref: + info[f"p_{c}_ref"] = p_ref[c] + info[f"p_{c}_dev"] = class_prob(stratify_on[c][dev_i]) + info[f"p_{c}_test"] = class_prob(stratify_on[c][test_i]) + + return train_i, dev_i, test_i, info + + +def optimize_traintest_split(X, y, split_on, stratify_on, weight=None, + test_size=.1, k=30, seed=42): + + ''' optimize group-disjunct split which is guided by: + - disjunct split of values in SPLIT_ON + - stratification by all keys in STRATIFY_ON (targets and groupings) + - test set proportion in X should be close to test_size (which is the test + proportion in set(split_on)) + + Score to be minimized: (sum_v[w(v) * irad(v)] + w(d) * d) / (sum_v[w(v)] + w(d)) + (v: variables to be stratified on + w(v): their weight + irad(v): information radius between reference distribution of classes in v + and test set distribution + N(v): number of stratification variables + d: absolute difference between test sizes of X and set(split_on) + w(d): its weight + + Args: + X: (pd.DataFrame) of features/groupings for which best split + is to be calculated. Of shape (N, M) + y: (np.array) of targets of length N + if type(y[0]) in ["str", "int"]: y is assumed to be categorical, so that it is additionally + tested that all partitions cover all classes. Else y is assumed to be numeric and no + coverage test is done. + split_on: (np.array) list of length N with grouping variable (e.g. speaker IDs), + on which the group-disjunct split is to be performed. Must be categorical. + stratify_on: (dict) Dict-keys are variable names (targets and/or further groupings) + the split should be stratified on (groupings could e.g. be sex, age class, etc). + Dict-Values are np.array-s of length N that contain the variable values. All + variables must be categorical. + weight: (dict) weight for each variable in stratify_on. Defines their amount of + contribution to the optimization score. Uniform weighting by default. Additional + key: "size_diff" defines how test size diff should be weighted. + test_size: (float) test proportion in set(split_on), e.g. 
10% of speakers to be held-out + k: (int) number of different splits to be tried out + seed: (int) random seed + Returns: + train_i: (np.array) train set indices in X + test_i: (np.array) test set indices in X + info: (dict) detail information about reference and achieved prob distributions + "size_testset_in_spliton": intended test_size + "size_testset_in_X": optimized test proportion in X + "p_ref_{c}": reference class distribution calculated from stratify_on[c] + "p_test_{c}": test set class distribution calculated from stratify_on[c][test_i] + ''' + + gss = GroupShuffleSplit(n_splits=k, test_size=test_size, + random_state=seed) + + # set weight defaults + if weight is None: + weight = {} + for c in stratify_on.keys(): + if c not in weight: + weight[c] = 1 + if "size_diff" not in weight: + weight["size_diff"] = 1 + + # stratification reference distributions calculated on stratify_on + p_ref = {} + for c in stratify_on: + p_ref[c] = class_prob(stratify_on[c]) + + # best train and test indices in X; best associated score + train_i, test_i, best_sco = None, None, np.inf + + # data size + N = len(y) + + # full target coverage in all partitions + full_target_coverage = False + + # categorical target: number of classes for coverage test + if is_categorical(y[0]): + nc = len(set(y)) + else: + nc = None + + # brute-force optimization of SPLIT_ON split + for tri, tei in gss.split(X, y, split_on): + + # all classes maintained in all partitions? + if nc: + nc_train = len(set(y[tri])) + nc_test = len(set(y[tei])) + if min(nc_train, nc_test) < nc: + continue + + full_target_coverage = True + + sco = calc_split_score(tei, stratify_on, weight, p_ref, N, test_size) + if sco < best_sco: + train_i, test_i, best_sco = tri, tei, sco + + if test_i is None: + sys.exit(exit_message(full_target_coverage)) + + # matching info + info = {"score": best_sco, + "size_testset_in_spliton": test_size, + "size_testset_in_X": np.round(len(test_i) / N, 2)} + + for c in p_ref: + info[f"p_{c}_ref"] = p_ref[c] + info[f"p_{c}_test"] = class_prob(stratify_on[c][test_i]) + + return train_i, test_i, info + + +def calc_split_score(test_i, stratify_on, weight, p_ref, N, test_size, + dev_i=None, dev_size=None): + + ''' calculate split score based on class distribution IRADs and + differences in partition sizes of groups vs observations; smaller is better. + If dev_i and dev_size are not provided, the score is calculated for the train/test + split only. If they are provided the score is calculated for the train/dev/test split + Args: + test_i: (np.array) of test set indices + stratify_on: (dict) Dict-keys are variable names (targets and/or further groupings) + the split should be stratified on (groupings could e.g. be sex, age class, etc). + Dict-Values are np.array-s of length N that contain the variable values. + weight: (dict) weight for each variable in stratify_on. 
Additional + key: "size_diff" that weights the grouping vs observation level test set size difference + p_ref: (dict) reference class distributions for all variables in stratify_on + N: (int) size of underlying data set + test_size: (float) test proportion in value set of variable, the disjunct grouping + has been carried out + dev_i: (np.array) of dev test indices + dev_size: (float) dev proportion in value set of variable, the disjunct grouping + has been carried out (this value should have been adjusted after splitting off the + test set) + ''' + + if dev_i is None: + do_dev = False + else: + do_dev = True + + # dev and test set class distributions + p_test, p_dev = {}, {} + for c in p_ref: + p_test[c] = class_prob(stratify_on[c][test_i]) + if do_dev: + p_dev[c] = class_prob(stratify_on[c][dev_i]) + + # score + sco, wgt = 0, 0 + + # IRADs (if p_test[c] or p_dec[c] do not contain + # all classes in p_ref[c], return INF) + for c in p_ref: + irad, full_coverage = calc_irad(p_ref[c], p_test[c]) + if not full_coverage: + return np.inf + if do_dev: + irad_dev, full_coverage = calc_irad(p_ref[c], p_dev[c]) + if not full_coverage: + return np.inf + irad = max(irad, irad_dev) + + sco += (weight[c] * irad) + wgt += weight[c] + + # partition size difference groups vs observations + size_diff = np.abs(len(test_i) / N - test_size) + if do_dev: + size_diff_dev = np.abs(len(dev_i) / N - dev_size) + size_diff = max(size_diff, size_diff_dev) + + sco += (weight["size_diff"] * size_diff) + wgt += weight["size_diff"] + + sco /= wgt + + return sco + + +def calc_irad(p1, p2): + + ''' calculate information radius of prob dicts p1 and p2 + Args: + p1, p2: (dict) of probabilities + Returns: + ir: (float) information radius + full_coverage: (bool) True if all elements in p1 occur in p2 + and vice versa + ''' + + p, q = [], [] + full_coverage = True + + for u in sorted(p1.keys()): + + if u not in p2: + full_coverage = False + a = 0.0 + else: + a = p2[u] + + p.append(p1[u]) + q.append(a) + + if full_coverage: + if len(p2.keys()) > len(p1.keys()): + full_coverage = False + + irad = ssp.distance.jensenshannon(p, q) + + return irad, full_coverage + + +def class_prob(y): + + ''' returns class probabilities in y + Args: + y (array-like) of classes + Returns: + p (dict) assigning to each class in Y its maximum likelihood + ''' + + p = {} + N = len(y) + c = Counter(y) + for x in c: + p[x] = c[x] / N + + return p + + +def is_categorical(x): + + ''' returns True if type of x is in str or int*, + else False ''' + + if type(x) in [str, int, np.int16, np.int32, np.int64, + np.uint8, np.uint16, np.uint32]: + return True + return False + + +def dummy_variable(X, columns, specs=None, squeeze_classes=False): + + ''' + creates dummy variable from binned numeric columns that can be used + later for stratification etc. + + Args: + X: (pd.DataFrame) + columns: (str or list) of numeric column names + specs: (dict or str) + if nested dict: keys are column names with subdict that contains the + arguments for binning(), i.e. n_bins and lower_boundaries + squeeze_classes: (boolean) further squeeze classes by sorting the digits + within the string. + Example: from binning of 3 columns, each into 2 bins, we got + "000", "100", "010", "001", "110", "101", "011", "111". 
+ These classes are further squeezed by within-string sorting: + "000", "001", "011", "111" + + Returns: + y: (list) of class strings of length X.shape[0] + + ''' + + df_bin = pd.DataFrame() + if specs is None: + specs = {} + if type(columns) is str: + columns = [columns] + + # bin columns + for col in columns: + if col not in X.columns: + sys.exit(f"column {col} not in dataframe") + if col in specs: + kwargs = specs[col] + else: + kwargs = {"nbins": 2} + yc = binning(X[col].to_numpy(), **kwargs) + df_bin[col] = yc.astype(str) + + # concatenate + df_bin["binvar"] = "" + for col in columns: + df_bin["binvar"] += df_bin[col] + + # squeeze + if squeeze_classes: + def squeezing(x): + return "".join(sorted(x)) + df_bin["binvar"] = df_bin["binvar"].apply(squeezing) + + y = df_bin["binvar"].tolist() + return y + + +def binning(y, nbins=3, lower_boundaries=None): + + ''' + bins numeric array y either intrinsically into nbins classes + based on an equidistant percentile split, or extrinsically + by using the lower_boundaries values. + + Args: + y: (np.array) with numeric data + nbins: (int) number of bins + lower_boundaries: (list) of lower bin boundaries. + If provided nbins will be ignored and y is binned + extrinsically. The first value of lower_boundaries + is always corrected not to be higher than min(y). + Returns: + yc: (np.array) with bin IDs (integers from 0 to nbins-1) + ''' + + # intrinsic binning by equidistant percentiles + if lower_boundaries is None: + prct = np.linspace(0, 100, nbins+1) + lower_boundaries = np.percentile(y, prct) + lower_boundaries = lower_boundaries[0:nbins] + else: + # make sure that entire range of y is covered + lower_boundaries[0] = min(lower_boundaries[0], np.min(y)) + + # binned array + yc = np.zeros(len(y), dtype=int) + for i in range(1, len(lower_boundaries)): + yc[y >= lower_boundaries[i]] = i + + return yc + + +def optimize_testset_split(X, y, split_on, stratify_on, weight=None, + test_size=.1, k=30, seed=42): + + ''' backward compatibility ''' + return optimize_traintest_split(X, y, split_on, stratify_on, + weight, test_size, k, seed) + + +def exit_message(full_target_coverage, infx="test"): + + if not full_target_coverage: + return "not all partitions contain all target classes. What you can do:\n" \ + "(1) increase your dev and/or test partition, or\n" \ + "(2) reduce the amount of target classes by merging some of them." + + return f"\n:-o No {infx} set split found. Reason is, that for at least one of the\n" \ + f"stratification variables not all its values can make it into the {infx} set.\n" \ + f"This happens e.g. if the {infx} set size is chosen too small or\n" \ + "if the (multidimensional) distribution of the stratification\n" \ + "variables is sparse. 
What you can do:\n" \
+        "(1) remove a variable from this stratification, or\n" \
+        "(2) merge classes within a variable to increase the per class probabilities, or\n" \
+        f"(3) increase the {infx} set size, or\n" \
+        "(4) increase the number of different splits (if it was small, say < 10, before), or\n" \
+        "(5) in case your target is numeric and you have added a binned target array to the\n" \
+        "    stratification variables: reduce the number of bins.\n" \
+        "Good luck!\n"
diff --git a/1.3.0/trainDevTestSplit.py b/1.3.0/trainDevTestSplit.py
new file mode 100644
index 0000000..a57b57c
--- /dev/null
+++ b/1.3.0/trainDevTestSplit.py
@@ -0,0 +1,52 @@
+# import json
+import pandas as pd
+import audb
+from split_utils import optimize_traindevtest_split, binning
+
+
+def split_df(df):
+    # seed, dev and test proportion, number of different splits
+    seed = 42
+    dev_size = 0.2
+    test_size = 0.2
+    k = 30
+
+    # targets
+    age = df["age"].to_numpy()
+    age = binning(age, nbins=5)
+
+    # on which variable to split
+    speaker = df["speaker"].to_numpy()
+
+    # on which variables (targets, groupings) to stratify
+    stratif_vars = {
+        "age": age,
+        "gender": df["gender"].to_numpy(),
+    }
+
+    # weights for all stratify_on variables and
+    # for the dev and test proportion match. Give the target
+    # variable age more weight than the groupings.
+    weight = {"age": 2, "gender": 1, "size_diff": 10}
+
+    # find optimal dev and test indices DEV_I and TEST_I in DF
+    # info: dict with goodness of split information
+    train_i, dev_i, test_i, info = optimize_traindevtest_split(
+        X=df,
+        y=age,
+        split_on=speaker,
+        stratify_on=stratif_vars,
+        weight=weight,
+        dev_size=dev_size,
+        test_size=test_size,
+        k=k,
+        seed=seed,
+    )
+
+    print("dev split of DF:")
+    print(df.iloc[dev_i])
+    print("dev split of target variable:")
+    print(age[dev_i])
+    print("goodness of split:")
+    print(info)
+    return (df.iloc[train_i], df.iloc[dev_i], df.iloc[test_i])
diff --git a/1.3.0/util.py b/1.3.0/util.py
new file mode 100644
index 0000000..7b80c50
--- /dev/null
+++ b/1.3.0/util.py
@@ -0,0 +1,63 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+
+num_workers = 8
+
+
+# plot sex distribution, age and duration
+def describe_df(df, file_path):
+    title = f"# samples: {df.shape[0]}, # speakers: {df.speaker.nunique()}"
+    if "duration" in df:
+        fig, axes = plt.subplots(nrows=2, ncols=2)
+        df["age"].plot(kind="hist", ax=axes[0, 0], title="age")
+
+        # df["duration"].plot(kind="hist", ax=axes[0, 1], title="duration")
+        df.groupby("gender")["speaker"].nunique().plot(kind="pie", ax=axes[1, 0])
+        df_speakers = pd.DataFrame()
+        pd.options.mode.chained_assignment = None  # default='warn'
+        for s in df.speaker.unique():
+            df_speaker = df[df.speaker == s]
+            df_speaker["samplenum"] = df_speaker.shape[0]
+            df_speakers = pd.concat([df_speakers, df_speaker.head(1)])
+        df_speakers["samplenum"].value_counts().sort_values().plot(
+            kind="bar",
+            stacked=True,
+            title=f"samples per speaker",
+            rot=0,
+            ax=axes[1, 1],
+        )
+    else:
+        fig, axes = plt.subplots(nrows=1, ncols=3)
+        df["age"].plot(kind="hist", ax=axes[0], title="age")
+        df.groupby("gender")["speaker"].nunique().plot(kind="pie", ax=axes[1])
+        df_speakers = pd.DataFrame()
+        pd.options.mode.chained_assignment = None  # default='warn'
+        for s in df.speaker.unique():
+            df_speaker = df[df.speaker == s]
+            df_speaker["samplenum"] = df_speaker.shape[0]
+            df_speakers = pd.concat([df_speakers, df_speaker.head(1)])
+        df_speakers["samplenum"].value_counts().sort_values().plot(
+            kind="bar",
+            stacked=True,
+            title=f"samples per speaker",
+            rot=0,
+            ax=axes[2],
+        )
+
+    fig.suptitle(title)
+    plt.tight_layout()
+    fig.savefig(file_path)
+
+
+def limit_speakers(df, max=20):
+    """
+    Limit the number of samples per speaker to max.
+    """
+    df_ret = pd.DataFrame()
+    for s in df.speaker.unique():
+        s_df = df[df["speaker"].eq(s)]
+        if s_df.shape[0] < max:
+            df_ret = pd.concat([df_ret, s_df])
+        else:
+            df_ret = pd.concat([df_ret, s_df.sample(max)])
+    return df_ret

From 23330a7df549caa0762e82f996fa1c0f1df1259c Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Tue, 19 Mar 2024 14:07:11 +0100
Subject: [PATCH 03/10] update

---
 1.3.0/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/1.3.0/requirements.txt b/1.3.0/requirements.txt
index 69203a9..b606b8c 100644
--- a/1.3.0/requirements.txt
+++ b/1.3.0/requirements.txt
@@ -1,7 +1,8 @@
 pandas
 matplotlib
 seaborn
-jupyter
+scipy
+scikit-learn
 audb
 audeer
 audformat

From 8e6626586a7d3f8a26e0728e1590623a2c2b90d5 Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Tue, 19 Mar 2024 14:13:10 +0100
Subject: [PATCH 04/10] update

---
 1.3.0/create.py | 9 +++++++--
 CHANGELOG.md    | 8 ++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/1.3.0/create.py b/1.3.0/create.py
index 63313ce..41fa80d 100644
--- a/1.3.0/create.py
+++ b/1.3.0/create.py
@@ -1,11 +1,15 @@
-import os
+"""
+Add age.[train|dev|test] tables with speaker information.
+Each consists of 20 randomly selected, emotionally neutral samples per speaker.
+All tables are age/gender balanced.
+"""
+
 import random
 import pandas as pd
 
 import audb
 import audeer
 import audformat
-import audiofile
 import util
 import trainDevTestSplit
@@ ... @@ def main():
     db.save(build_dir)
+    print(db)
 
 
 if __name__ == "__main__":
     main()
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d35504b..8301806 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,14 @@
 Changelog
 =========
 
+Version 1.3.0 (2024/03/19)
+--------------------------
+
+* Added: age.[train|dev|test] tables with speaker information
+* each consists of 20 randomly selected, emotionally neutral samples per speaker
+* all tables are age/gender balanced
+
+
 Version 1.2.0 (2023/04/06)
 --------------------------

From 958a09192fccc40135a11292c3a60c61bebecee8 Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Tue, 19 Mar 2024 14:35:02 +0100
Subject: [PATCH 05/10] update

---
 1.3.0/create.py | 52 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/1.3.0/create.py b/1.3.0/create.py
index 41fa80d..99cf8e6 100644
--- a/1.3.0/create.py
+++ b/1.3.0/create.py
@@ ... @@ def main():
     df = df[df["gender"] != "other"]
     df = df[df["corrupted"] != True]
     audeer.mkdir(image_dir)
     util.describe_df(df, f"{image_dir}all.png")
+    # make a dataframe for 20 samples per speaker
+    df_lim = util.limit_speakers(df)
+    util.describe_df(df_lim, f"{image_dir}limited.png")
+    # make a dataframe for emotionally neutral samples
     df["emotion"] = df_emo["emotion.0"].values
-    df = df[df.emotion.isin(["neutral"])]
-    util.describe_df(df, f"{image_dir}all_neutral.png")
-    df = util.limit_speakers(df)
-    util.describe_df(df, f"{image_dir}all_limited.png")
+    df_neut = df[df.emotion.isin(["neutral"])]
+    util.describe_df(df_neut, f"{image_dir}neutral.png")
+    # make a dataframe for emotionally neutral samples, limited to 20 samples
+    df_neut_lim = util.limit_speakers(df_neut)
+    util.describe_df(df_neut_lim, f"{image_dir}neutral_limited.png")
 
-    # create split sets
-    splits = {}
-    df_train, df_dev, df_test = trainDevTestSplit.split_df(df)
-    splits["train"] = df_train
-    splits["dev"] = df_dev
-    splits["test"] = df_test
+    # create split sets for samples from all emotions
+    splits_emo = {}
+    df_train, df_dev, df_test = trainDevTestSplit.split_df(df_lim)
+    splits_emo["train"] = df_train
+    splits_emo["dev"] = df_dev
+    splits_emo["test"] = df_test
+    # create split sets for neutral samples
+    splits_neut = {}
+    df_train, df_dev, df_test = trainDevTestSplit.split_df(df_neut_lim)
+    splits_neut["train"] = df_train
+    splits_neut["dev"] = df_dev
+    splits_neut["test"] = df_test
     # plot distributions
-    for split in ["train", "dev", "test"]:
+    for split in splits_emo.keys():
         print(f"split: {split}")
-        util.describe_df(splits[split], f"{image_dir}{split}.png")
+        util.describe_df(splits_emo[split], f"{image_dir}{split}.png")
+        util.describe_df(splits_neut[split], f"{image_dir}{split}_neut.png")
 
     # fill the database with new tables
     age_tables_name = "age."
+    age_tables_emotional_name = "age.emotional."
-    for split in splits.keys():
+    for split in splits_emo.keys():
         db[f"{age_tables_name}{split}"] = audformat.Table(
-            splits[split].index,
-            description=f"Table selected for age and binary gender balance from the emotionally neutral samples, max 20 samples per speaker.",
+            splits_neut[split].index,
+            description=f"Table selected for age and binary gender balance from the emotionally neutral samples, limited to 20 samples per speaker.",
         )
         for field in ["speaker"]:
             db[f"{age_tables_name}{split}"][field] = audformat.Column(scheme_id=field)
-            db[f"{age_tables_name}{split}"][field].set(splits[split][field])
+            db[f"{age_tables_name}{split}"][field].set(splits_neut[split][field])
+        db[f"{age_tables_emotional_name}{split}"] = audformat.Table(
+            splits_emo[split].index,
+            description=f"Table selected for age and binary gender balance from all samples, limited to 20 samples per speaker.",
+        )
+        for field in ["speaker"]:
+            db[f"{age_tables_emotional_name}{split}"][field] = audformat.Column(scheme_id=field)
+            db[f"{age_tables_emotional_name}{split}"][field].set(splits_emo[split][field])
 
     db.save(build_dir)

From 731f694270b56eaa1ff710d80fce649d015ee16e Mon Sep 17 00:00:00 2001
From: Felix Burkhardt
Date: Tue, 16 Apr 2024 14:58:47 +0200
Subject: [PATCH 06/10] Update 1.3.0/create.py

Co-authored-by: Hagen Wierstorf
---
 1.3.0/create.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/1.3.0/create.py b/1.3.0/create.py
index 99cf8e6..c1070ce 100644
--- a/1.3.0/create.py
+++ b/1.3.0/create.py
@@ ... @@ def main():
     name = "crema-d"
     previous_version = "1.2.0"
 
-    build_dir = "../build"
+    build_dir = "./build"
     build_dir = audeer.mkdir(build_dir)

From 7dd26f8dbb5de08d8f12f62ca53628fa29ef2361 Mon Sep 17 00:00:00 2001
From: Felix Burkhardt
Date: Tue, 16 Apr 2024 14:59:02 +0200
Subject: [PATCH 07/10] Update 1.3.0/publish.py

Co-authored-by: Hagen Wierstorf
---
 1.3.0/publish.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/1.3.0/publish.py b/1.3.0/publish.py
index c585f30..dae7bb7 100644
--- a/1.3.0/publish.py
+++ b/1.3.0/publish.py
@@ ... @@
 import audb
 
 previous_version = '1.2.0'
 version = '1.3.0'
-build_dir = '../build'
+build_dir = './build'
 
 repository = audb.Repository(

From 06669d015195e0eb7a61290bd0f8bd0ce2b1599b Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Tue, 16 Apr 2024 17:34:40 +0200
Subject: [PATCH 08/10] update

---
 1.3.0/requirements.txt.lock | 134 ++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 1.3.0/requirements.txt.lock

diff --git a/1.3.0/requirements.txt.lock
b/1.3.0/requirements.txt.lock new file mode 100644 index 0000000..245b109 --- /dev/null +++ b/1.3.0/requirements.txt.lock @@ -0,0 +1,134 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=requirements.txt.lock requirements.txt +# +--index-url https://artifactory.audeering.com/artifactory/api/pypi/pypi/simple + +audb==1.6.5 + # via -r requirements.txt +audbackend[artifactory]==1.0.2 + # via audb +audeer==2.0.0 + # via + # -r requirements.txt + # audb + # audbackend + # audformat + # audiofile + # audobject +audformat==1.1.2 + # via + # -r requirements.txt + # audb +audiofile==1.4.0 + # via + # audb + # audformat +audmath==1.4.0 + # via audiofile +audobject==0.7.11 + # via audb +audresample==1.3.3 + # via audb +certifi==2024.2.2 + # via requests +cffi==1.16.0 + # via soundfile +charset-normalizer==3.3.2 + # via requests +contourpy==1.2.1 + # via matplotlib +cycler==0.12.1 + # via matplotlib +dohq-artifactory==0.10.0 + # via audbackend +filelock==3.13.4 + # via audb +fonttools==4.51.0 + # via matplotlib +idna==3.7 + # via requests +importlib-metadata==7.1.0 + # via audobject +iso-639==0.4.5 + # via audformat +iso3166==2.1.1 + # via audformat +joblib==1.4.0 + # via scikit-learn +kiwisolver==1.4.5 + # via matplotlib +matplotlib==3.8.4 + # via + # -r requirements.txt + # seaborn +numpy==1.26.4 + # via + # audiofile + # audmath + # audresample + # contourpy + # matplotlib + # pandas + # scikit-learn + # scipy + # seaborn +oyaml==1.0 + # via + # audb + # audformat + # audobject +packaging==24.0 + # via + # audobject + # matplotlib +pandas==2.2.2 + # via + # -r requirements.txt + # audformat + # seaborn +pillow==10.3.0 + # via matplotlib +pycparser==2.22 + # via cffi +pyjwt==2.8.0 + # via dohq-artifactory +pyparsing==3.1.2 + # via matplotlib +python-dateutil==2.9.0.post0 + # via + # dohq-artifactory + # matplotlib + # pandas +pytz==2024.1 + # via pandas +pyyaml==6.0.1 + # via + # audformat + # oyaml +requests==2.31.0 + # via dohq-artifactory +scikit-learn==1.4.2 + # via -r requirements.txt +scipy==1.13.0 + # via + # -r requirements.txt + # scikit-learn +seaborn==0.13.2 + # via -r requirements.txt +six==1.16.0 + # via python-dateutil +soundfile==0.12.1 + # via audiofile +threadpoolctl==3.4.0 + # via scikit-learn +tqdm==4.66.2 + # via audeer +tzdata==2024.1 + # via pandas +urllib3==2.2.1 + # via requests +zipp==3.18.1 + # via importlib-metadata From 62de36d064d00d7689819ce0dcc6ae65b8894fbb Mon Sep 17 00:00:00 2001 From: FBurkhardt Date: Tue, 16 Apr 2024 19:16:15 +0200 Subject: [PATCH 09/10] update --- 1.3.0/README.md | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ 1.3.0/create.py | 30 +++++++++++++--------- 1.3.0/util.py | 65 ++++++++++++++++-------------------------------- 3 files changed, 106 insertions(+), 55 deletions(-) create mode 100644 1.3.0/README.md diff --git a/1.3.0/README.md b/1.3.0/README.md new file mode 100644 index 0000000..ab81d99 --- /dev/null +++ b/1.3.0/README.md @@ -0,0 +1,66 @@ +This creates new age and gender train, dev and test sets from the database, +for neutral and emotional samples. 
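+
+A minimal usage sketch of the published result (assuming access to the
+hosting repository configured in publish.py; the table name is one of those
+created below):
+
+```
+import audb
+
+db = audb.load("crema-d", version="1.3.0", only_metadata=True)
+df = db["age.train"].get()
+```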
+
+The following files are included:
+* *create.py* generate a new database with splits
+* *publish.py* publish the new database with audb
+* *split_utils.py* utilities to stratify splits
+* *trainDevTestSplit.py* wrapper around split_utils that performs the train/dev/test split
+* *util.py* general helper functions
+* *requirements.txt* collection of packages that are needed
+
+To generate the new database, you
+* set up a new environment
+* install the packages
+* call python create.py
+* inspect the result folder
+* call python publish.py
+
+
+The names of the new splits are
+
+```
+  age.dev:
+    type: filewise
+    description: Table selected for age and binary gender balance from the emotionally
+      neutral samples, limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.emotional.dev:
+    type: filewise
+    description: Table selected for age and binary gender balance from all samples,
+      limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.emotional.test:
+    type: filewise
+    description: Table selected for age and binary gender balance from all samples,
+      limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.emotional.train:
+    type: filewise
+    description: Table selected for age and binary gender balance from all samples,
+      limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.test:
+    type: filewise
+    description: Table selected for age and binary gender balance from the emotionally
+      neutral samples, limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.train:
+    type: filewise
+    description: Table selected for age and binary gender balance from the emotionally
+      neutral samples, limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+```
+To access the age and gender of the train split samples with emotional texts, you could do
+```
+df = db["age.emotional.train"].get()
+df["gender"] = db["files"]["speaker"].get(map="sex")
+df["age"] = db["files"]["speaker"].get(map="age").astype("int")
+```
+

diff --git a/1.3.0/create.py b/1.3.0/create.py
index c1070ce..8f44fa5 100644
--- a/1.3.0/create.py
+++ b/1.3.0/create.py
@@ ... @@
 import random
 import pandas as pd
+import matplotlib
+import matplotlib.pyplot as plt
+import seaborn as sns
 
 import audb
 import audeer
 import audformat
 import util
 import trainDevTestSplit
@@ ... @@ def main():
     df = df[df["gender"] != "other"]
     df = df[df["corrupted"] != True]
-    audeer.mkdir(image_dir)
-    util.describe_df(df, f"{image_dir}all.png")
     # make a dataframe for 20 samples per speaker
     df_lim = util.limit_speakers(df)
-    util.describe_df(df_lim, f"{image_dir}limited.png")
     # make a dataframe for emotionally neutral samples
     df["emotion"] = df_emo["emotion.0"].values
     df_neut = df[df.emotion.isin(["neutral"])]
-    util.describe_df(df_neut, f"{image_dir}neutral.png")
     # make a dataframe for emotionally neutral samples, limited to 20 samples
     df_neut_lim = util.limit_speakers(df_neut)
-    util.describe_df(df_neut_lim, f"{image_dir}neutral_limited.png")
 
     # create split sets for samples from all emotions
     splits_emo = {}
     df_train, df_dev, df_test = trainDevTestSplit.split_df(df_lim)
     splits_emo["train"] = df_train
     splits_emo["dev"] = df_dev
     splits_emo["test"] = df_test
     # create split sets for neutral samples
     splits_neut = {}
     df_train, df_dev, df_test = trainDevTestSplit.split_df(df_neut_lim)
     splits_neut["train"] = df_train
     splits_neut["dev"] = df_dev
     splits_neut["test"] = df_test
-    # plot distributions
-    for split in splits_emo.keys():
-        print(f"split: {split}")
-        util.describe_df(splits_emo[split], f"{image_dir}{split}.png")
-        util.describe_df(splits_neut[split], f"{image_dir}{split}_neut.png")
 
     # fill the database with new tables
     age_tables_name = 
"age." @@ -101,8 +94,23 @@ def main(): db[f"{age_tables_emotional_name}{split}"][field].set(splits_emo[split][field]) db.save(build_dir) - print(db) + + + print("testing:") + res_dir = audeer.mkdir("results") + for split in splits_neut.keys(): + df = db[f"{age_tables_name}{split}"].get() + df["gender"] = db["files"]["speaker"].get(map="sex") + df["age"] = db["files"]["speaker"].get(map="age").astype("int") + sn = df["speaker"].nunique() + print(f"new {split}: {df.shape[0]}, {sn}") + + util.distribution(df, split) + plt.tight_layout() + plt.savefig(f"{res_dir}/{split}.png") + plt.close() + if __name__ == "__main__": main() diff --git a/1.3.0/util.py b/1.3.0/util.py index 7b80c50..90109e0 100644 --- a/1.3.0/util.py +++ b/1.3.0/util.py @@ -1,52 +1,29 @@ import pandas as pd +import matplotlib import matplotlib.pyplot as plt +import seaborn as sns num_workers = 8 - -# plot sex distribution, age and duration -def describe_df(df, file_path): - title = f"# samples: {df.shape[0]}, # speakers: {df.speaker.nunique()}" - if "duration" in df: - fig, axes = plt.subplots(nrows=2, ncols=2) - df["age"].plot(kind="hist", ax=axes[0, 0], title="age") - - # df["duration"].plot(kind="hist", ax=axes[0, 1], title="duration") - df.groupby("gender")["speaker"].nunique().plot(kind="pie", ax=axes[1, 0]) - df_speakers = pd.DataFrame() - pd.options.mode.chained_assignment = None # default='warn' - for s in df.speaker.unique(): - df_speaker = df[df.speaker == s] - df_speaker["samplenum"] = df_speaker.shape[0] - df_speakers = pd.concat([df_speakers, df_speaker.head(1)]) - df_speakers["samplenum"].value_counts().sort_values().plot( - kind="bar", - stacked=True, - title=f"samples per speaker", - rot=0, - ax=axes[1, 1], - ) - else: - fig, axes = plt.subplots(nrows=1, ncols=3) - df["age"].plot(kind="hist", ax=axes[0], title="age") - df.groupby("gender")["speaker"].nunique().plot(kind="pie", ax=axes[1]) - df_speakers = pd.DataFrame() - pd.options.mode.chained_assignment = None # default='warn' - for s in df.speaker.unique(): - df_speaker = df[df.speaker == s] - df_speaker["samplenum"] = df_speaker.shape[0] - df_speakers = pd.concat([df_speakers, df_speaker.head(1)]) - df_speakers["samplenum"].value_counts().sort_values().plot( - kind="bar", - stacked=True, - title=f"samples per speaker", - rot=0, - ax=axes[2], - ) - - fig.suptitle(title) - plt.tight_layout() - fig.savefig(file_path) +def distribution(df, split): + sns.histplot( + # df[df.gender == gender]["age"].astype("float32"), + data = df, + x = "age", + hue = "gender", + common_bins=False, + stat="frequency", + kde=True, + edgecolor=None, + kde_kws={"cut": 3}, # hard code like in distplot() + ) + plt.grid(alpha=0.4) + sns.despine() + plt.xlabel("age") + plt.title(f"Frequency of samples for {split}") + # Force y ticks at integer locations + ax = plt.gca() + ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True)) def limit_speakers(df, max=20): From 4419d706f09aee6ca40e6f853ab8b59c80974b9d Mon Sep 17 00:00:00 2001 From: Felix Burkhardt Date: Wed, 17 Apr 2024 09:22:13 +0200 Subject: [PATCH 10/10] Update CHANGELOG.md Co-authored-by: Hagen Wierstorf --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8301806..ebf049b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ Changelog ========= -Version 1.3.0 (2024/03/19) +Version 1.3.0 (2024/04/17) -------------------------- * Added: age.[train|dev|test] tables with speaker information