From d4e78098f040bbcdb42bde7b3e586b4b908d8667 Mon Sep 17 00:00:00 2001 From: FBurkhardt Date: Thu, 25 Jan 2024 14:36:07 +0100 Subject: [PATCH 01/10] initial --- 1.3.0/create.py | 44 ++++++++++++++++++++++++++++++++++++++++++ 1.3.0/publish.py | 19 ++++++++++++++++++ 1.3.0/requirements.txt | 8 ++++++++ 3 files changed, 71 insertions(+) create mode 100644 1.3.0/create.py create mode 100644 1.3.0/publish.py create mode 100644 1.3.0/requirements.txt diff --git a/1.3.0/create.py b/1.3.0/create.py new file mode 100644 index 0000000..1d538b1 --- /dev/null +++ b/1.3.0/create.py @@ -0,0 +1,44 @@ +""" +create.py + +add new age tables to the databases + +* per speaker use 10-20 random sentences of about 2-10 seconds +* split into train test dev, should be age/gender balanced +* make two versions: one with emotion acted samples and one with neutral +* so all in all 6 new tables + +""" + +import os +import pandas as pd +import audb +import audeer +import audformat + + +def main(): + name = "crema-d" + previous_version = "1.2.0" + + build_dir = "../build" + build_dir = audeer.mkdir(build_dir) + + audb.load_to( + build_dir, + name, + version=previous_version, + num_workers=8, + only_metadata=True, + verbose=True, + ) + db = audformat.Database.load(build_dir) + + splits = ["train", "dev", "test"] + for split in splits: + pass + db.save(build_dir) + + +if __name__ == "__main__": + main() diff --git a/1.3.0/publish.py b/1.3.0/publish.py new file mode 100644 index 0000000..ecc8b9f --- /dev/null +++ b/1.3.0/publish.py @@ -0,0 +1,19 @@ +import audb + +previous_version = "1.2.0" +version = "1.3.0" +build_dir = "./build" + +repository = audb.Repository( + name="data-public-local", + host="https://artifactory.audeering.com/artifactory", + backend="artifactory", +) +audb.publish( + build_dir, + version=version, + previous_version=previous_version, + repository=repository, + num_workers=1, + verbose=True, +) diff --git a/1.3.0/requirements.txt b/1.3.0/requirements.txt new file mode 100644 index 0000000..69203a9 --- /dev/null +++ b/1.3.0/requirements.txt @@ -0,0 +1,8 @@ +pandas +matplotlib +seaborn +jupyter +audb +audeer +audformat + From 58667d764f7b6be3f20136392287f767ca90d0f1 Mon Sep 17 00:00:00 2001 From: FBurkhardt Date: Tue, 19 Mar 2024 14:07:00 +0100 Subject: [PATCH 02/10] update --- 1.3.0/create.py | 69 +++-- 1.3.0/publish.py | 11 +- 1.3.0/split_utils.py | 514 +++++++++++++++++++++++++++++++++++++ 1.3.0/trainDevTestSplit.py | 52 ++++ 1.3.0/util.py | 63 +++++ 5 files changed, 689 insertions(+), 20 deletions(-) create mode 100644 1.3.0/split_utils.py create mode 100644 1.3.0/trainDevTestSplit.py create mode 100644 1.3.0/util.py diff --git a/1.3.0/create.py b/1.3.0/create.py index 1d538b1..63313ce 100644 --- a/1.3.0/create.py +++ b/1.3.0/create.py @@ -1,20 +1,18 @@ -""" -create.py - -add new age tables to the databases - -* per speaker use 10-20 random sentences of about 2-10 seconds -* split into train test dev, should be age/gender balanced -* make two versions: one with emotion acted samples and one with neutral -* so all in all 6 new tables - -""" - import os +import random import pandas as pd + import audb import audeer import audformat +import audiofile +import util +import trainDevTestSplit + +# make it reproducible +random.seed(23) + +image_dir = "images/" def main(): @@ -34,9 +32,50 @@ def main(): ) db = audformat.Database.load(build_dir) - splits = ["train", "dev", "test"] - for split in splits: - pass + # get age, gender and emotion info + df = db["files"].get() + df_emo = pd.concat( + [ + 
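+            # collect the emotion labels of the three original split tables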
db["emotion.categories.train"].get(), + db["emotion.categories.dev"].get(), + db["emotion.categories.test"].get(), + ] + ) + df["age"] = db["files"]["speaker"].get(map="age").astype("int") + df["gender"] = db["files"]["speaker"].get(map="sex") + # df["duration"] = df.index.to_series().map(lambda x: audiofile.duration(x)) + df = df[df["gender"] != "other"] + df = df[df["corrupted"] != True] + audeer.mkdir(image_dir) + util.describe_df(df, f"{image_dir}all.png") + df["emotion"] = df_emo["emotion.0"].values + df = df[df.emotion.isin(["neutral"])] + util.describe_df(df, f"{image_dir}all_neutral.png") + df = util.limit_speakers(df) + util.describe_df(df, f"{image_dir}all_limited.png") + + # create split sets + splits = {} + df_train, df_dev, df_test = trainDevTestSplit.split_df(df) + splits["train"] = df_train + splits["dev"] = df_dev + splits["test"] = df_test + # plot distributions + for split in ["train", "dev", "test"]: + print(f"split: {split}") + util.describe_df(splits[split], f"{image_dir}{split}.png") + + # fill the database with new tables + age_tables_name = "age." + for split in splits.keys(): + db[f"{age_tables_name}{split}"] = audformat.Table( + splits[split].index, + description=f"Table selected for age and binary gender balance from the emotionally neutral samples, max 20 samples per speaker.", + ) + for field in ["speaker"]: + db[f"{age_tables_name}{split}"][field] = audformat.Column(scheme_id=field) + db[f"{age_tables_name}{split}"][field].set(splits[split][field]) + db.save(build_dir) diff --git a/1.3.0/publish.py b/1.3.0/publish.py index ecc8b9f..c585f30 100644 --- a/1.3.0/publish.py +++ b/1.3.0/publish.py @@ -1,14 +1,15 @@ import audb -previous_version = "1.2.0" -version = "1.3.0" -build_dir = "./build" +previous_version = '1.2.0' +version = '1.3.0' +build_dir = '../build' repository = audb.Repository( - name="data-public-local", - host="https://artifactory.audeering.com/artifactory", + name="data-public", + host="https://audeering.jfrog.io/artifactory", backend="artifactory", ) + audb.publish( build_dir, version=version, diff --git a/1.3.0/split_utils.py b/1.3.0/split_utils.py new file mode 100644 index 0000000..86f582a --- /dev/null +++ b/1.3.0/split_utils.py @@ -0,0 +1,514 @@ +from collections import Counter +import numpy as np +import pandas as pd +import scipy.spatial as ssp +from sklearn.model_selection import GroupShuffleSplit +import sys + + +def optimize_traindevtest_split(X, y, split_on, stratify_on, weight=None, dev_size=.1, + test_size=.1, k=30, seed=42): + + ''' optimize group-disjunct split into training, dev, and test set, which is guided by: + - disjunct split of values in SPLIT_ON + - stratification by all keys in STRATIFY_ON (targets and groupings) + - test set proportion in X should be close to test_size (which is the test + proportion in set(split_on)) + + Score to be minimized: (sum_v[w(v) * max_irad(v)] + w(d) * max_d) / (sum_v[w(v)] + w(d)) + (v: variables to be stratified on + w(v): their weight + max_irad(v): maximum information radius of reference distribution of classes in v and + - dev set distribution, + - test set distribution + N(v): number of stratification variables + max_d: maximum of absolute difference between dev and test sizes of X and set(split_on) + w(d): its weight + + Args: + X: (pd.DataFrame) of features/groupings for which best split + is to be calculated. 
Of shape (N, M) + y: (np.array) of targets of length N + if type(y[0]) in ["str", "int"]: y is assumed to be categorical, so that it is additionally + tested that all partitions cover all classes. Else y is assumed to be numeric and no + coverage test is done. + split_on: (np.array) list of length N with grouping variable (e.g. speaker IDs), + on which the group-disjunct split is to be performed. Must be categorical. + stratify_on: (dict) Dict-keys are variable names (targets and/or further groupings) + the split should be stratified on (groupings could e.g. be sex, age class, etc). + Dict-Values are np.array-s of length N that contain the variable values. All + variables must be categorical. + weight: (dict) weight for each variable in stratify_on. Defines their amount of + contribution to the optimization score. Uniform weighting by default. Additional + key: "size_diff" defines how the corresponding size differences should be weighted. + dev_size: (float) proportion in set(split_on) for dev set, e.g. 10% of speakers + to be held-out + test_size: (float) test proportion in set(split_on) for test set + k: (int) number of different splits to be tried out + seed: (int) random seed + Returns: + train_i: (np.array) train set indices in X + dev_i: (np.array) dev set indices in X + test_i: (np.array) test set indices in X + info: (dict) detail information about reference and achieved prob distributions + "dev_size_in_spliton": intended grouping dev_size + "dev_size_in_X": optimized dev proportion of observations in X + "test_size_in_spliton": intended grouping test_size + "test_size_in_X": optimized test proportion of observations in X + "p_ref_{c}": reference class distribution calculated from stratify_on[c] + "p_dev_{c}": dev set class distribution calculated from stratify_on[c][dev_i] + "p_test_{c}": test set class distribution calculated from stratify_on[c][test_i] + ''' + + # data size + N = len(y) + + # categorical target: number of classes for coverage test + if is_categorical(y[0]): + nc = len(set(y)) + else: + nc = None + + # adjusted dev_size after having split off the test set + dev_size_adj = (dev_size * N) / (N - test_size * N) + + # split all into train/dev vs test + gss_o = GroupShuffleSplit(n_splits=k, test_size=test_size, + random_state=seed) + + # split train/dev into train vs dev + gss_i = GroupShuffleSplit(n_splits=k, test_size=dev_size_adj, + random_state=seed) + + # set weight defaults + if weight is None: + weight = {} + for c in stratify_on.keys(): + if c not in weight: + weight[c] = 1 + if "size_diff" not in weight: + weight["size_diff"] = 1 + + # stratification reference distributions calculated on stratify_on + p_ref = {} + for c in stratify_on: + p_ref[c] = class_prob(stratify_on[c]) + + # best train/dev/test indices in X; best associated score + train_i, dev_i, test_i, best_sco = None, None, None, np.inf + + # full target coverage in all partitions + full_target_coverage = False + + # brute-force optimization of SPLIT_ON split + # outer loop *_o: splitting into train/dev and test + # inner loop *_i: spltting into train and dev + for tri_o, tei_o in gss_o.split(X, y, split_on): + + # current train/dev partition + X_i = X.iloc[tri_o] + y_i = y[tri_o] + split_on_i = split_on[tri_o] + + for tri_i, tei_i in gss_i.split(X_i, y_i, split_on_i): + + # all classes maintained in all partitions? 
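+            # (nc is None for a numeric target, in which case the
+            # class-coverage check below is skipped)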
+ if nc: + nc_train = len(set(y[tri_o[tri_i]])) + nc_dev = len(set(y[tri_o[tei_i]])) + nc_test = len(set(y[tei_o])) + if min(nc_train, nc_dev, nc_test) < nc: + continue + + full_target_coverage = True + + sco = calc_split_score(test_i=tei_o, + stratify_on=stratify_on, + weight=weight, p_ref=p_ref, + N=N, test_size=test_size, + dev_i=tri_o[tei_i], + dev_size=dev_size_adj) + + if sco < best_sco: + best_sco = sco + test_i = tei_o + train_i = tri_o[tri_i] + dev_i = tri_o[tei_i] + + if test_i is None: + sys.exit(exit_message(full_target_coverage, "dev and test")) + + # matching info + info = {"score": best_sco, + "size_devset_in_spliton": dev_size, + "size_devset_in_X": np.round(len(dev_i) / N, 2), + "size_testset_in_spliton": test_size, + "size_testset_in_X": np.round(len(test_i) / N, 2)} + + for c in p_ref: + info[f"p_{c}_ref"] = p_ref[c] + info[f"p_{c}_dev"] = class_prob(stratify_on[c][dev_i]) + info[f"p_{c}_test"] = class_prob(stratify_on[c][test_i]) + + return train_i, dev_i, test_i, info + + +def optimize_traintest_split(X, y, split_on, stratify_on, weight=None, + test_size=.1, k=30, seed=42): + + ''' optimize group-disjunct split which is guided by: + - disjunct split of values in SPLIT_ON + - stratification by all keys in STRATIFY_ON (targets and groupings) + - test set proportion in X should be close to test_size (which is the test + proportion in set(split_on)) + + Score to be minimized: (sum_v[w(v) * irad(v)] + w(d) * d) / (sum_v[w(v)] + w(d)) + (v: variables to be stratified on + w(v): their weight + irad(v): information radius between reference distribution of classes in v + and test set distribution + N(v): number of stratification variables + d: absolute difference between test sizes of X and set(split_on) + w(d): its weight + + Args: + X: (pd.DataFrame) of features/groupings for which best split + is to be calculated. Of shape (N, M) + y: (np.array) of targets of length N + if type(y[0]) in ["str", "int"]: y is assumed to be categorical, so that it is additionally + tested that all partitions cover all classes. Else y is assumed to be numeric and no + coverage test is done. + split_on: (np.array) list of length N with grouping variable (e.g. speaker IDs), + on which the group-disjunct split is to be performed. Must be categorical. + stratify_on: (dict) Dict-keys are variable names (targets and/or further groupings) + the split should be stratified on (groupings could e.g. be sex, age class, etc). + Dict-Values are np.array-s of length N that contain the variable values. All + variables must be categorical. + weight: (dict) weight for each variable in stratify_on. Defines their amount of + contribution to the optimization score. Uniform weighting by default. Additional + key: "size_diff" defines how test size diff should be weighted. + test_size: (float) test proportion in set(split_on), e.g. 
10% of speakers to be held-out + k: (int) number of different splits to be tried out + seed: (int) random seed + Returns: + train_i: (np.array) train set indices in X + test_i: (np.array) test set indices in X + info: (dict) detail information about reference and achieved prob distributions + "size_testset_in_spliton": intended test_size + "size_testset_in_X": optimized test proportion in X + "p_ref_{c}": reference class distribution calculated from stratify_on[c] + "p_test_{c}": test set class distribution calculated from stratify_on[c][test_i] + ''' + + gss = GroupShuffleSplit(n_splits=k, test_size=test_size, + random_state=seed) + + # set weight defaults + if weight is None: + weight = {} + for c in stratify_on.keys(): + if c not in weight: + weight[c] = 1 + if "size_diff" not in weight: + weight["size_diff"] = 1 + + # stratification reference distributions calculated on stratify_on + p_ref = {} + for c in stratify_on: + p_ref[c] = class_prob(stratify_on[c]) + + # best train and test indices in X; best associated score + train_i, test_i, best_sco = None, None, np.inf + + # data size + N = len(y) + + # full target coverage in all partitions + full_target_coverage = False + + # categorical target: number of classes for coverage test + if is_categorical(y[0]): + nc = len(set(y)) + else: + nc = None + + # brute-force optimization of SPLIT_ON split + for tri, tei in gss.split(X, y, split_on): + + # all classes maintained in all partitions? + if nc: + nc_train = len(set(y[tri])) + nc_test = len(set(y[tei])) + if min(nc_train, nc_test) < nc: + continue + + full_target_coverage = True + + sco = calc_split_score(tei, stratify_on, weight, p_ref, N, test_size) + if sco < best_sco: + train_i, test_i, best_sco = tri, tei, sco + + if test_i is None: + sys.exit(exit_message(full_target_coverage)) + + # matching info + info = {"score": best_sco, + "size_testset_in_spliton": test_size, + "size_testset_in_X": np.round(len(test_i) / N, 2)} + + for c in p_ref: + info[f"p_{c}_ref"] = p_ref[c] + info[f"p_{c}_test"] = class_prob(stratify_on[c][test_i]) + + return train_i, test_i, info + + +def calc_split_score(test_i, stratify_on, weight, p_ref, N, test_size, + dev_i=None, dev_size=None): + + ''' calculate split score based on class distribution IRADs and + differences in partition sizes of groups vs observations; smaller is better. + If dev_i and dev_size are not provided, the score is calculated for the train/test + split only. If they are provided the score is calculated for the train/dev/test split + Args: + test_i: (np.array) of test set indices + stratify_on: (dict) Dict-keys are variable names (targets and/or further groupings) + the split should be stratified on (groupings could e.g. be sex, age class, etc). + Dict-Values are np.array-s of length N that contain the variable values. + weight: (dict) weight for each variable in stratify_on. 
Additional + key: "size_diff" that weights the grouping vs observation level test set size difference + p_ref: (dict) reference class distributions for all variables in stratify_on + N: (int) size of underlying data set + test_size: (float) test proportion in value set of variable, the disjunct grouping + has been carried out + dev_i: (np.array) of dev test indices + dev_size: (float) dev proportion in value set of variable, the disjunct grouping + has been carried out (this value should have been adjusted after splitting off the + test set) + ''' + + if dev_i is None: + do_dev = False + else: + do_dev = True + + # dev and test set class distributions + p_test, p_dev = {}, {} + for c in p_ref: + p_test[c] = class_prob(stratify_on[c][test_i]) + if do_dev: + p_dev[c] = class_prob(stratify_on[c][dev_i]) + + # score + sco, wgt = 0, 0 + + # IRADs (if p_test[c] or p_dec[c] do not contain + # all classes in p_ref[c], return INF) + for c in p_ref: + irad, full_coverage = calc_irad(p_ref[c], p_test[c]) + if not full_coverage: + return np.inf + if do_dev: + irad_dev, full_coverage = calc_irad(p_ref[c], p_dev[c]) + if not full_coverage: + return np.inf + irad = max(irad, irad_dev) + + sco += (weight[c] * irad) + wgt += weight[c] + + # partition size difference groups vs observations + size_diff = np.abs(len(test_i) / N - test_size) + if do_dev: + size_diff_dev = np.abs(len(dev_i) / N - dev_size) + size_diff = max(size_diff, size_diff_dev) + + sco += (weight["size_diff"] * size_diff) + wgt += weight["size_diff"] + + sco /= wgt + + return sco + + +def calc_irad(p1, p2): + + ''' calculate information radius of prob dicts p1 and p2 + Args: + p1, p2: (dict) of probabilities + Returns: + ir: (float) information radius + full_coverage: (bool) True if all elements in p1 occur in p2 + and vice versa + ''' + + p, q = [], [] + full_coverage = True + + for u in sorted(p1.keys()): + + if u not in p2: + full_coverage = False + a = 0.0 + else: + a = p2[u] + + p.append(p1[u]) + q.append(a) + + if full_coverage: + if len(p2.keys()) > len(p1.keys()): + full_coverage = False + + irad = ssp.distance.jensenshannon(p, q) + + return irad, full_coverage + + +def class_prob(y): + + ''' returns class probabilities in y + Args: + y (array-like) of classes + Returns: + p (dict) assigning to each class in Y its maximum likelihood + ''' + + p = {} + N = len(y) + c = Counter(y) + for x in c: + p[x] = c[x] / N + + return p + + +def is_categorical(x): + + ''' returns True if type of x is in str or int*, + else False ''' + + if type(x) in [str, int, np.int16, np.int32, np.int64, + np.uint8, np.uint16, np.uint32]: + return True + return False + + +def dummy_variable(X, columns, specs=None, squeeze_classes=False): + + ''' + creates dummy variable from binned numeric columns that can be used + later for stratification etc. + + Args: + X: (pd.DataFrame) + columns: (str or list) of numeric column names + specs: (dict or str) + if nested dict: keys are column names with subdict that contains the + arguments for binning(), i.e. n_bins and lower_boundaries + squeeze_classes: (boolean) further squeeze classes by sorting the digits + within the string. + Example: from binning of 3 columns, each into 2 bins, we got + "000", "100", "010", "001", "110", "101", "011", "111". 
+ These classes are further squeezed by within-string sorting: + "000", "001", "011", "111" + + Returns: + y: (list) of class strings of length X.shape[0] + + ''' + + df_bin = pd.DataFrame() + if specs is None: + specs = {} + if type(columns) is str: + columns = [columns] + + # bin columns + for col in columns: + if col not in X.columns: + sys.exit(f"column {col} not in dataframe") + if col in specs: + kwargs = specs[col] + else: + kwargs = {"nbins": 2} + yc = binning(X[col].to_numpy(), **kwargs) + df_bin[col] = yc.astype(str) + + # concatenate + df_bin["binvar"] = "" + for col in columns: + df_bin["binvar"] += df_bin[col] + + # squeeze + if squeeze_classes: + def squeezing(x): + return "".join(sorted(x)) + df_bin["binvar"] = df_bin["binvar"].apply(squeezing) + + y = df_bin["binvar"].tolist() + return y + + +def binning(y, nbins=3, lower_boundaries=None): + + ''' + bins numeric array y either intrinsically into nbins classes + based on an equidistant percentile split, or extrinsically + by using the lower_boundaries values. + + Args: + y: (np.array) with numeric data + nbins: (int) number of bins + lower_boundaries: (list) of lower bin boundaries. + If provided nbins will be ignored and y is binned + extrinsically. The first value of lower_boundaries + is always corrected not to be higher than min(y). + Returns: + yc: (np.array) with bin IDs (integers from 0 to nbins-1) + ''' + + # intrinsic binning by equidistant percentiles + if lower_boundaries is None: + prct = np.linspace(0, 100, nbins+1) + lower_boundaries = np.percentile(y, prct) + lower_boundaries = lower_boundaries[0:nbins] + else: + # make sure that entire range of y is covered + lower_boundaries[0] = min(lower_boundaries[0], np.min(y)) + + # binned array + yc = np.zeros(len(y), dtype=int) + for i in range(1, len(lower_boundaries)): + yc[y >= lower_boundaries[i]] = i + + return yc + + +def optimize_testset_split(X, y, split_on, stratify_on, weight=None, + test_size=.1, k=30, seed=42): + + ''' backward compatibility ''' + return optimize_traintest_split(X, y, split_on, stratify_on, + weight, test_size, k, seed) + + +def exit_message(full_target_coverage, infx="test"): + + if not full_target_coverage: + return "not all partitions contain all target classes. What you can do:\n" \ + "(1) increase your dev and/or test partition, or\n" \ + "(2) reduce the amount of target classes by merging some of them." + + return f"\n:-o No {infx} set split found. Reason is, that for at least one of the\n" \ + f"stratification variables not all its values can make it into the {infx} set.\n" \ + f"This happens e.g. if the {infx} set size is chosen too small or\n" \ + "if the (multidimensional) distribution of the stratification\n" \ + "variables is sparse. 
What you can do:\n" \
+        "(1) remove a variable from this stratification, or\n" \
+        "(2) merge classes within a variable to increase the per class probabilities, or\n" \
+        f"(3) increase the {infx} set size, or\n" \
+        "(4) increase the number of different splits (if it was small, say < 10, before), or\n" \
+        "(5) in case your target is numeric and you have added a binned target array to the\n" \
+        "    stratification variables: reduce the number of bins.\n" \
+        "Good luck!\n"
diff --git a/1.3.0/trainDevTestSplit.py b/1.3.0/trainDevTestSplit.py
new file mode 100644
index 0000000..a57b57c
--- /dev/null
+++ b/1.3.0/trainDevTestSplit.py
@@ -0,0 +1,52 @@
+# import json
+import pandas as pd
+import audb
+from split_utils import optimize_traindevtest_split, binning
+
+
+def split_df(df):
+    # seed, dev and test proportion, number of different splits
+    seed = 42
+    dev_size = 0.2
+    test_size = 0.2
+    k = 30
+
+    # targets
+    age = df["age"].to_numpy()
+    age = binning(age, nbins=5)
+
+    # on which variable to split
+    speaker = df["speaker"].to_numpy()
+
+    # on which variables (targets, groupings) to stratify
+    stratif_vars = {
+        "age": age,
+        "gender": df["gender"].to_numpy(),
+    }
+
+    # weights for all stratify_on variables and
+    # for the dev and test proportion match. Give the target
+    # variable age more weight than the groupings.
+    weight = {"age": 2, "gender": 1, "size_diff": 10}
+
+    # find optimal dev and test indices DEV_I and TEST_I in DF
+    # info: dict with goodness of split information
+    train_i, dev_i, test_i, info = optimize_traindevtest_split(
+        X=df,
+        y=age,
+        split_on=speaker,
+        stratify_on=stratif_vars,
+        weight=weight,
+        dev_size=dev_size,
+        test_size=test_size,
+        k=k,
+        seed=seed,
+    )
+
+    print("dev split of DF:")
+    print(df.iloc[dev_i])
+    print("dev split of target variable:")
+    print(age[dev_i])
+    print("goodness of split:")
+    print(info)
+    return (df.iloc[train_i], df.iloc[dev_i], df.iloc[test_i])
diff --git a/1.3.0/util.py b/1.3.0/util.py
new file mode 100644
index 0000000..7b80c50
--- /dev/null
+++ b/1.3.0/util.py
@@ -0,0 +1,63 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+
+num_workers = 8
+
+
+# plot sex distribution, age and duration
+def describe_df(df, file_path):
+    title = f"# samples: {df.shape[0]}, # speakers: {df.speaker.nunique()}"
+    if "duration" in df:
+        fig, axes = plt.subplots(nrows=2, ncols=2)
+        df["age"].plot(kind="hist", ax=axes[0, 0], title="age")
+
+        # df["duration"].plot(kind="hist", ax=axes[0, 1], title="duration")
+        df.groupby("gender")["speaker"].nunique().plot(kind="pie", ax=axes[1, 0])
+        df_speakers = pd.DataFrame()
+        pd.options.mode.chained_assignment = None  # default='warn'
+        for s in df.speaker.unique():
+            df_speaker = df[df.speaker == s]
+            df_speaker["samplenum"] = df_speaker.shape[0]
+            df_speakers = pd.concat([df_speakers, df_speaker.head(1)])
+        df_speakers["samplenum"].value_counts().sort_values().plot(
+            kind="bar",
+            stacked=True,
+            title=f"samples per speaker",
+            rot=0,
+            ax=axes[1, 1],
+        )
+    else:
+        fig, axes = plt.subplots(nrows=1, ncols=3)
+        df["age"].plot(kind="hist", ax=axes[0], title="age")
+        df.groupby("gender")["speaker"].nunique().plot(kind="pie", ax=axes[1])
+        df_speakers = pd.DataFrame()
+        pd.options.mode.chained_assignment = None  # default='warn'
+        for s in df.speaker.unique():
+            df_speaker = df[df.speaker == s]
+            df_speaker["samplenum"] = df_speaker.shape[0]
+            df_speakers = pd.concat([df_speakers, df_speaker.head(1)])
+        df_speakers["samplenum"].value_counts().sort_values().plot(
+            kind="bar",
+            stacked=True,
+            title=f"samples per speaker",
+            rot=0,
+            ax=axes[2],
+        )
+
+    fig.suptitle(title)
+    plt.tight_layout()
+    fig.savefig(file_path)
+
+
+def limit_speakers(df, max=20):
+    """
+    Limit the number of samples per speaker to max.
+    """
+    df_ret = pd.DataFrame()
+    for s in df.speaker.unique():
+        s_df = df[df["speaker"].eq(s)]
+        if s_df.shape[0] < max:
+            df_ret = pd.concat([df_ret, s_df])
+        else:
+            df_ret = pd.concat([df_ret, s_df.sample(max)])
+    return df_ret

From 23330a7df549caa0762e82f996fa1c0f1df1259c Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Tue, 19 Mar 2024 14:07:11 +0100
Subject: [PATCH 03/10] update

---
 1.3.0/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/1.3.0/requirements.txt b/1.3.0/requirements.txt
index 69203a9..b606b8c 100644
--- a/1.3.0/requirements.txt
+++ b/1.3.0/requirements.txt
@@ -1,7 +1,8 @@
 pandas
 matplotlib
 seaborn
-jupyter
+scipy
+scikit-learn
 audb
 audeer
 audformat

From 8e6626586a7d3f8a26e0728e1590623a2c2b90d5 Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Tue, 19 Mar 2024 14:13:10 +0100
Subject: [PATCH 04/10] update

---
 1.3.0/create.py | 9 +++++++--
 CHANGELOG.md    | 8 ++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/1.3.0/create.py b/1.3.0/create.py
index 63313ce..41fa80d 100644
--- a/1.3.0/create.py
+++ b/1.3.0/create.py
@@ -1,11 +1,15 @@
-import os
+"""
+Add age.[train|dev|test] tables with speaker information.
+Each consists of 20 randomly selected, emotionally neutral samples per speaker.
+All tables are age/gender balanced.
+"""
+
 import random
 import pandas as pd
 
 import audb
 import audeer
 import audformat
-import audiofile
 import util
 import trainDevTestSplit
@@ ... @@ def main():
     db.save(build_dir)
+    print(db)
 
 
 if __name__ == "__main__":
     main()
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d35504b..8301806 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,14 @@
 Changelog
 =========
 
+Version 1.3.0 (2024/03/19)
+--------------------------
+
+* Added: age.[train|dev|test] tables with speaker information
+* each consists of 20 randomly selected, emotionally neutral samples per speaker
+* all tables are age/gender balanced
+
+
 Version 1.2.0 (2023/04/06)
 --------------------------

From 958a09192fccc40135a11292c3a60c61bebecee8 Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Tue, 19 Mar 2024 14:35:02 +0100
Subject: [PATCH 05/10] update

---
 1.3.0/create.py | 52 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/1.3.0/create.py b/1.3.0/create.py
index 41fa80d..99cf8e6 100644
--- a/1.3.0/create.py
+++ b/1.3.0/create.py
@@ ... @@ def main():
     df = df[df["gender"] != "other"]
     df = df[df["corrupted"] != True]
     audeer.mkdir(image_dir)
     util.describe_df(df, f"{image_dir}all.png")
+    # make a dataframe for 20 samples per speaker
+    df_lim = util.limit_speakers(df)
+    util.describe_df(df_lim, f"{image_dir}limited.png")
+    # make a dataframe for emotionally neutral samples
     df["emotion"] = df_emo["emotion.0"].values
-    df = df[df.emotion.isin(["neutral"])]
-    util.describe_df(df, f"{image_dir}all_neutral.png")
-    df = util.limit_speakers(df)
-    util.describe_df(df, f"{image_dir}all_limited.png")
+    df_neut = df[df.emotion.isin(["neutral"])]
+    util.describe_df(df_neut, f"{image_dir}neutral.png")
+    # make a dataframe for emotionally neutral samples, limited to 20 samples
+    df_neut_lim = util.limit_speakers(df_neut)
+    util.describe_df(df_neut_lim, f"{image_dir}neutral_limited.png")
 
-    # create split sets
-    splits = {}
-    df_train, df_dev, df_test = trainDevTestSplit.split_df(df)
-    splits["train"] = df_train
-    splits["dev"] = df_dev
-    splits["test"] = df_test
+    # create split sets for samples from all emotions
+    splits_emo = {}
+    df_train, df_dev, df_test = trainDevTestSplit.split_df(df_lim)
+    splits_emo["train"] = df_train
+    splits_emo["dev"] = df_dev
+    splits_emo["test"] = df_test
+    # create split sets for neutral samples
+    splits_neut = {}
+    df_train, df_dev, df_test = trainDevTestSplit.split_df(df_neut_lim)
+    splits_neut["train"] = df_train
+    splits_neut["dev"] = df_dev
+    splits_neut["test"] = df_test
     # plot distributions
-    for split in ["train", "dev", "test"]:
+    for split in splits_emo.keys():
         print(f"split: {split}")
-        util.describe_df(splits[split], f"{image_dir}{split}.png")
+        util.describe_df(splits_emo[split], f"{image_dir}{split}.png")
+        util.describe_df(splits_neut[split], f"{image_dir}{split}_neut.png")
 
     # fill the database with new tables
     age_tables_name = "age."
+    age_tables_emotional_name = "age.emotional."
-    for split in splits.keys():
+    for split in splits_emo.keys():
         db[f"{age_tables_name}{split}"] = audformat.Table(
-            splits[split].index,
-            description=f"Table selected for age and binary gender balance from the emotionally neutral samples, max 20 samples per speaker.",
+            splits_neut[split].index,
+            description=f"Table selected for age and binary gender balance from the emotionally neutral samples, limited to 20 samples per speaker.",
         )
         for field in ["speaker"]:
             db[f"{age_tables_name}{split}"][field] = audformat.Column(scheme_id=field)
-            db[f"{age_tables_name}{split}"][field].set(splits[split][field])
+            db[f"{age_tables_name}{split}"][field].set(splits_neut[split][field])
+        db[f"{age_tables_emotional_name}{split}"] = audformat.Table(
+            splits_emo[split].index,
+            description=f"Table selected for age and binary gender balance from all samples, limited to 20 samples per speaker.",
+        )
+        for field in ["speaker"]:
+            db[f"{age_tables_emotional_name}{split}"][field] = audformat.Column(scheme_id=field)
+            db[f"{age_tables_emotional_name}{split}"][field].set(splits_emo[split][field])
 
     db.save(build_dir)

From 731f694270b56eaa1ff710d80fce649d015ee16e Mon Sep 17 00:00:00 2001
From: Felix Burkhardt
Date: Tue, 16 Apr 2024 14:58:47 +0200
Subject: [PATCH 06/10] Update 1.3.0/create.py

Co-authored-by: Hagen Wierstorf
---
 1.3.0/create.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/1.3.0/create.py b/1.3.0/create.py
index 99cf8e6..c1070ce 100644
--- a/1.3.0/create.py
+++ b/1.3.0/create.py
@@ ... @@ def main():
     name = "crema-d"
     previous_version = "1.2.0"
 
-    build_dir = "../build"
+    build_dir = "./build"
     build_dir = audeer.mkdir(build_dir)

From 7dd26f8dbb5de08d8f12f62ca53628fa29ef2361 Mon Sep 17 00:00:00 2001
From: Felix Burkhardt
Date: Tue, 16 Apr 2024 14:59:02 +0200
Subject: [PATCH 07/10] Update 1.3.0/publish.py

Co-authored-by: Hagen Wierstorf
---
 1.3.0/publish.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/1.3.0/publish.py b/1.3.0/publish.py
index c585f30..dae7bb7 100644
--- a/1.3.0/publish.py
+++ b/1.3.0/publish.py
@@ ... @@
 import audb
 
 previous_version = '1.2.0'
 version = '1.3.0'
-build_dir = '../build'
+build_dir = './build'
 
 repository = audb.Repository(

From 06669d015195e0eb7a61290bd0f8bd0ce2b1599b Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Tue, 16 Apr 2024 17:34:40 +0200
Subject: [PATCH 08/10] update

---
 1.3.0/requirements.txt.lock | 134 ++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 1.3.0/requirements.txt.lock

diff --git a/1.3.0/requirements.txt.lock
b/1.3.0/requirements.txt.lock new file mode 100644 index 0000000..245b109 --- /dev/null +++ b/1.3.0/requirements.txt.lock @@ -0,0 +1,134 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=requirements.txt.lock requirements.txt +# +--index-url https://artifactory.audeering.com/artifactory/api/pypi/pypi/simple + +audb==1.6.5 + # via -r requirements.txt +audbackend[artifactory]==1.0.2 + # via audb +audeer==2.0.0 + # via + # -r requirements.txt + # audb + # audbackend + # audformat + # audiofile + # audobject +audformat==1.1.2 + # via + # -r requirements.txt + # audb +audiofile==1.4.0 + # via + # audb + # audformat +audmath==1.4.0 + # via audiofile +audobject==0.7.11 + # via audb +audresample==1.3.3 + # via audb +certifi==2024.2.2 + # via requests +cffi==1.16.0 + # via soundfile +charset-normalizer==3.3.2 + # via requests +contourpy==1.2.1 + # via matplotlib +cycler==0.12.1 + # via matplotlib +dohq-artifactory==0.10.0 + # via audbackend +filelock==3.13.4 + # via audb +fonttools==4.51.0 + # via matplotlib +idna==3.7 + # via requests +importlib-metadata==7.1.0 + # via audobject +iso-639==0.4.5 + # via audformat +iso3166==2.1.1 + # via audformat +joblib==1.4.0 + # via scikit-learn +kiwisolver==1.4.5 + # via matplotlib +matplotlib==3.8.4 + # via + # -r requirements.txt + # seaborn +numpy==1.26.4 + # via + # audiofile + # audmath + # audresample + # contourpy + # matplotlib + # pandas + # scikit-learn + # scipy + # seaborn +oyaml==1.0 + # via + # audb + # audformat + # audobject +packaging==24.0 + # via + # audobject + # matplotlib +pandas==2.2.2 + # via + # -r requirements.txt + # audformat + # seaborn +pillow==10.3.0 + # via matplotlib +pycparser==2.22 + # via cffi +pyjwt==2.8.0 + # via dohq-artifactory +pyparsing==3.1.2 + # via matplotlib +python-dateutil==2.9.0.post0 + # via + # dohq-artifactory + # matplotlib + # pandas +pytz==2024.1 + # via pandas +pyyaml==6.0.1 + # via + # audformat + # oyaml +requests==2.31.0 + # via dohq-artifactory +scikit-learn==1.4.2 + # via -r requirements.txt +scipy==1.13.0 + # via + # -r requirements.txt + # scikit-learn +seaborn==0.13.2 + # via -r requirements.txt +six==1.16.0 + # via python-dateutil +soundfile==0.12.1 + # via audiofile +threadpoolctl==3.4.0 + # via scikit-learn +tqdm==4.66.2 + # via audeer +tzdata==2024.1 + # via pandas +urllib3==2.2.1 + # via requests +zipp==3.18.1 + # via importlib-metadata From 62de36d064d00d7689819ce0dcc6ae65b8894fbb Mon Sep 17 00:00:00 2001 From: FBurkhardt Date: Tue, 16 Apr 2024 19:16:15 +0200 Subject: [PATCH 09/10] update --- 1.3.0/README.md | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ 1.3.0/create.py | 30 +++++++++++++--------- 1.3.0/util.py | 65 ++++++++++++++++-------------------------------- 3 files changed, 106 insertions(+), 55 deletions(-) create mode 100644 1.3.0/README.md diff --git a/1.3.0/README.md b/1.3.0/README.md new file mode 100644 index 0000000..ab81d99 --- /dev/null +++ b/1.3.0/README.md @@ -0,0 +1,66 @@ +This creates new age and gender train, dev and test sets from the database, +for neutral and emotional samples. 
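+
+A minimal usage sketch of the published result (assuming access to the
+hosting repository configured in publish.py; the table name is one of those
+created below):
+
+```
+import audb
+
+db = audb.load("crema-d", version="1.3.0", only_metadata=True)
+df = db["age.train"].get()
+```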
+
+The following files are included:
+* *create.py* generate a new database with splits
+* *publish.py* publish the new database with audb
+* *split_utils.py* utilities to stratify splits
+* *trainDevTestSplit.py* wrapper around split_utils that performs the train/dev/test split
+* *util.py* general helper functions
+* *requirements.txt* collection of packages that are needed
+
+To generate the new database, you
+* set up a new environment
+* install the packages
+* call python create.py
+* inspect the result folder
+* call python publish.py
+
+
+The names of the new splits are
+
+```
+  age.dev:
+    type: filewise
+    description: Table selected for age and binary gender balance from the emotionally
+      neutral samples, limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.emotional.dev:
+    type: filewise
+    description: Table selected for age and binary gender balance from all samples,
+      limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.emotional.test:
+    type: filewise
+    description: Table selected for age and binary gender balance from all samples,
+      limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.emotional.train:
+    type: filewise
+    description: Table selected for age and binary gender balance from all samples,
+      limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.test:
+    type: filewise
+    description: Table selected for age and binary gender balance from the emotionally
+      neutral samples, limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.train:
+    type: filewise
+    description: Table selected for age and binary gender balance from the emotionally
+      neutral samples, limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+```
+To access the age and gender of the train split samples with emotional texts, you could do
+```
+df = db["age.emotional.train"].get()
+df["gender"] = db["files"]["speaker"].get(map="sex")
+df["age"] = db["files"]["speaker"].get(map="age").astype("int")
+```
+

diff --git a/1.3.0/create.py b/1.3.0/create.py
index c1070ce..8f44fa5 100644
--- a/1.3.0/create.py
+++ b/1.3.0/create.py
@@ ... @@
 import random
 import pandas as pd
+import matplotlib
+import matplotlib.pyplot as plt
+import seaborn as sns
 
 import audb
 import audeer
 import audformat
 import util
 import trainDevTestSplit
@@ ... @@ def main():
     df = df[df["gender"] != "other"]
     df = df[df["corrupted"] != True]
-    audeer.mkdir(image_dir)
-    util.describe_df(df, f"{image_dir}all.png")
     # make a dataframe for 20 samples per speaker
     df_lim = util.limit_speakers(df)
-    util.describe_df(df_lim, f"{image_dir}limited.png")
     # make a dataframe for emotionally neutral samples
     df["emotion"] = df_emo["emotion.0"].values
     df_neut = df[df.emotion.isin(["neutral"])]
-    util.describe_df(df_neut, f"{image_dir}neutral.png")
     # make a dataframe for emotionally neutral samples, limited to 20 samples
     df_neut_lim = util.limit_speakers(df_neut)
-    util.describe_df(df_neut_lim, f"{image_dir}neutral_limited.png")
 
     # create split sets for samples from all emotions
     splits_emo = {}
     df_train, df_dev, df_test = trainDevTestSplit.split_df(df_lim)
     splits_emo["train"] = df_train
     splits_emo["dev"] = df_dev
     splits_emo["test"] = df_test
     # create split sets for neutral samples
     splits_neut = {}
     df_train, df_dev, df_test = trainDevTestSplit.split_df(df_neut_lim)
     splits_neut["train"] = df_train
     splits_neut["dev"] = df_dev
     splits_neut["test"] = df_test
-    # plot distributions
-    for split in splits_emo.keys():
-        print(f"split: {split}")
-        util.describe_df(splits_emo[split], f"{image_dir}{split}.png")
-        util.describe_df(splits_neut[split], f"{image_dir}{split}_neut.png")
 
     # fill the database with new tables
     age_tables_name = 
"age." @@ -101,8 +94,23 @@ def main(): db[f"{age_tables_emotional_name}{split}"][field].set(splits_emo[split][field]) db.save(build_dir) - print(db) + + + print("testing:") + res_dir = audeer.mkdir("results") + for split in splits_neut.keys(): + df = db[f"{age_tables_name}{split}"].get() + df["gender"] = db["files"]["speaker"].get(map="sex") + df["age"] = db["files"]["speaker"].get(map="age").astype("int") + sn = df["speaker"].nunique() + print(f"new {split}: {df.shape[0]}, {sn}") + + util.distribution(df, split) + plt.tight_layout() + plt.savefig(f"{res_dir}/{split}.png") + plt.close() + if __name__ == "__main__": main() diff --git a/1.3.0/util.py b/1.3.0/util.py index 7b80c50..90109e0 100644 --- a/1.3.0/util.py +++ b/1.3.0/util.py @@ -1,52 +1,29 @@ import pandas as pd +import matplotlib import matplotlib.pyplot as plt +import seaborn as sns num_workers = 8 - -# plot sex distribution, age and duration -def describe_df(df, file_path): - title = f"# samples: {df.shape[0]}, # speakers: {df.speaker.nunique()}" - if "duration" in df: - fig, axes = plt.subplots(nrows=2, ncols=2) - df["age"].plot(kind="hist", ax=axes[0, 0], title="age") - - # df["duration"].plot(kind="hist", ax=axes[0, 1], title="duration") - df.groupby("gender")["speaker"].nunique().plot(kind="pie", ax=axes[1, 0]) - df_speakers = pd.DataFrame() - pd.options.mode.chained_assignment = None # default='warn' - for s in df.speaker.unique(): - df_speaker = df[df.speaker == s] - df_speaker["samplenum"] = df_speaker.shape[0] - df_speakers = pd.concat([df_speakers, df_speaker.head(1)]) - df_speakers["samplenum"].value_counts().sort_values().plot( - kind="bar", - stacked=True, - title=f"samples per speaker", - rot=0, - ax=axes[1, 1], - ) - else: - fig, axes = plt.subplots(nrows=1, ncols=3) - df["age"].plot(kind="hist", ax=axes[0], title="age") - df.groupby("gender")["speaker"].nunique().plot(kind="pie", ax=axes[1]) - df_speakers = pd.DataFrame() - pd.options.mode.chained_assignment = None # default='warn' - for s in df.speaker.unique(): - df_speaker = df[df.speaker == s] - df_speaker["samplenum"] = df_speaker.shape[0] - df_speakers = pd.concat([df_speakers, df_speaker.head(1)]) - df_speakers["samplenum"].value_counts().sort_values().plot( - kind="bar", - stacked=True, - title=f"samples per speaker", - rot=0, - ax=axes[2], - ) - - fig.suptitle(title) - plt.tight_layout() - fig.savefig(file_path) +def distribution(df, split): + sns.histplot( + # df[df.gender == gender]["age"].astype("float32"), + data = df, + x = "age", + hue = "gender", + common_bins=False, + stat="frequency", + kde=True, + edgecolor=None, + kde_kws={"cut": 3}, # hard code like in distplot() + ) + plt.grid(alpha=0.4) + sns.despine() + plt.xlabel("age") + plt.title(f"Frequency of samples for {split}") + # Force y ticks at integer locations + ax = plt.gca() + ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True)) def limit_speakers(df, max=20): From 4419d706f09aee6ca40e6f853ab8b59c80974b9d Mon Sep 17 00:00:00 2001 From: Felix Burkhardt Date: Wed, 17 Apr 2024 09:22:13 +0200 Subject: [PATCH 10/10] Update CHANGELOG.md Co-authored-by: Hagen Wierstorf --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8301806..ebf049b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ Changelog ========= -Version 1.3.0 (2024/03/19) +Version 1.3.0 (2024/04/17) -------------------------- * Added: age.[train|dev|test] tables with speaker information