Merge pull request #2 from audeering/add_age_tables

Add age tables
audeering · Apr 17, 2024 · 92753e7 · 92753e7
2 parents 0c05335 + 4419d70
commit 92753e7
Show file tree

Hide file tree

Showing 9 changed files with 959 additions and 0 deletions.
diff --git a/1.3.0/README.md b/1.3.0/README.md
@@ -0,0 +1,66 @@
+This creates new age and gender train, dev and test sets from the database,
+for neutral and emotional samples.
+
+The following files are included:
+* *create.py* generate a new database with splits
+* *publish.py* publish the new database with audb
+* *split_utils.py* utilities to stratify splits
+* *trainDevTestSplit.py* helper functions for split_utils
+* *util.py* general helper functions
+* *requirements.txt* collection of packages that are needed
+
+To generate the new database, you
+* set up a new environment
+* install the packages listed in *requirements.txt*
+* call `python create.py`
+* inspect the *results* folder
+* call `python publish.py`
+
+
+The names of the new splits are
+
+```
+  age.dev:
+    type: filewise
+    description: Table selected for age and binary gender balance from the emotionally
+      neutral samples,  limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.emotional.dev:
+    type: filewise
+    description: Table selected for age and binary gender balance from all samples,
+      limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.emotional.test:
+    type: filewise
+    description: Table selected for age and binary gender balance from all samples,
+      limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.emotional.train:
+    type: filewise
+    description: Table selected for age and binary gender balance from all samples,
+      limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.test:
+    type: filewise
+    description: Table selected for age and binary gender balance from the emotionally
+      neutral samples,  limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+  age.train:
+    type: filewise
+    description: Table selected for age and binary gender balance from the emotionally
+      neutral samples,  limited to 20 samples per speaker.
+    columns:
+      speaker: {scheme_id: speaker}
+```
+To access the age and gender of the samples in the train split drawn from all emotions (`age.emotional.train`), you could do:
+```
+df = db["age.emotional.train"].get()
+df["gender"] = db["files"]["speaker"].get(map="sex")
+df["age"] = db["files"]["speaker"].get(map="age").astype("int")
+```
+
diff --git a/1.3.0/create.py b/1.3.0/create.py
@@ -0,0 +1,116 @@
+"""
+Add age.[train|dev|test] tables with speaker information
+consists of randomly selected 20 emotionally neutral samples per speaker
+all tables being age/gender balanced
+"""
+
+import random
+import pandas as pd
+import matplotlib
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+import audb
+import audeer
+import audformat
+import util
+import trainDevTestSplit
+
+# make it reproducible
+# NOTE(review): this seeds only Python's `random` module; if the helpers in
+# util / trainDevTestSplit draw from another RNG (e.g. numpy) they are not
+# covered by this seed -- confirm.
+random.seed(23)
+
+# NOTE(review): not referenced in the visible part of this file
+image_dir = "images/"
+
+
+def main():
+    name = "crema-d"
+    previous_version = "1.2.0"
+
+    build_dir = "./build"
+    build_dir = audeer.mkdir(build_dir)
+
+    audb.load_to(
+        build_dir,
+        name,
+        version=previous_version,
+        num_workers=8,
+        only_metadata=True,
+        verbose=True,
+    )
+    db = audformat.Database.load(build_dir)
+
+    # get age, gender and emotion info
+    df = db["files"].get()
+    df_emo = pd.concat(
+        [
+            db["emotion.categories.train"].get(),
+            db["emotion.categories.dev"].get(),
+            db["emotion.categories.test"].get(),
+        ]
+    )
+    df["age"] = db["files"]["speaker"].get(map="age").astype("int")
+    df["gender"] = db["files"]["speaker"].get(map="sex")
+    #    df["duration"] = df.index.to_series().map(lambda x: audiofile.duration(x))
+    df = df[df["gender"] != "other"]
+    df = df[df["corrupted"] != True]
+    # make a dataframe for 20 samples per speaker
+    df_lim = util.limit_speakers(df)
+    # make a dataframe for emotionally neutral samples
+    df["emotion"] = df_emo["emotion.0"].values
+    df_neut = df[df.emotion.isin(["neutral"])]
+    # make a dataframe for emotionally neutral samples , limited to 20 samples
+    df_neut_lim = util.limit_speakers(df_neut)
+
+    # create split sets for samples from all emotions
+    splits_emo = {}
+    df_train, df_dev, df_test = trainDevTestSplit.split_df(df_lim)
+    splits_emo["train"] = df_train
+    splits_emo["dev"] = df_dev
+    splits_emo["test"] = df_test
+    # create split sets for neutral samples
+    splits_neut = {}
+    df_train, df_dev, df_test = trainDevTestSplit.split_df(df_neut_lim)
+    splits_neut["train"] = df_train
+    splits_neut["dev"] = df_dev
+    splits_neut["test"] = df_test
+
+    # fill the database with new tables
+    age_tables_name = "age."
+    age_tables_emotional_name = "age.emotional."
+    for split in splits_emo.keys():
+        db[f"{age_tables_name}{split}"] = audformat.Table(
+            splits_neut[split].index,
+            description=f"Table selected for age and binary gender balance from the emotionally neutral samples,  limited to 20 samples per speaker.",
+        )
+        for field in ["speaker"]:
+            db[f"{age_tables_name}{split}"][field] = audformat.Column(scheme_id=field)
+            db[f"{age_tables_name}{split}"][field].set(splits_neut[split][field])
+        db[f"{age_tables_emotional_name}{split}"] = audformat.Table(
+            splits_emo[split].index,
+            description=f"Table selected for age and binary gender balance from all samples, limited to 20 samples per speaker.",
+        )
+        for field in ["speaker"]:
+            db[f"{age_tables_emotional_name}{split}"][field] = audformat.Column(scheme_id=field)
+            db[f"{age_tables_emotional_name}{split}"][field].set(splits_emo[split][field])
+
+    db.save(build_dir)
+    print(db)
+
+
+
+    print("testing:")
+    res_dir = audeer.mkdir("results")
+    for split in splits_neut.keys():
+        df = db[f"{age_tables_name}{split}"].get()
+        df["gender"] = db["files"]["speaker"].get(map="sex")
+        df["age"] = db["files"]["speaker"].get(map="age").astype("int")
+        sn = df["speaker"].nunique()
+        print(f"new {split}: {df.shape[0]}, {sn}")
+
+        util.distribution(df, split)
+        plt.tight_layout()
+        plt.savefig(f"{res_dir}/{split}.png")
+        plt.close()
+
+# run the database creation only when this file is executed as a script
+if __name__ == "__main__":
+    main()
diff --git a/1.3.0/publish.py b/1.3.0/publish.py
@@ -0,0 +1,20 @@
+import audb
+
+previous_version = '1.2.0'
+version = '1.3.0'
+build_dir = './build'
+
+repository = audb.Repository(
+    name="data-public",
+    host="https://audeering.jfrog.io/artifactory",
+    backend="artifactory",
+)
+
+audb.publish(
+    build_dir,
+    version=version,
+    previous_version=previous_version,
+    repository=repository,
+    num_workers=1,
+    verbose=True,
+)
diff --git a/1.3.0/requirements.txt b/1.3.0/requirements.txt
@@ -0,0 +1,9 @@
+pandas
+matplotlib
+seaborn
+scipy
+scikit-learn
+audb
+audeer
+audformat
+
diff --git a/1.3.0/requirements.txt.lock b/1.3.0/requirements.txt.lock
@@ -0,0 +1,134 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --output-file=requirements.txt.lock requirements.txt
+#
+--index-url https://artifactory.audeering.com/artifactory/api/pypi/pypi/simple
+
+audb==1.6.5
+    # via -r requirements.txt
+audbackend[artifactory]==1.0.2
+    # via audb
+audeer==2.0.0
+    # via
+    #   -r requirements.txt
+    #   audb
+    #   audbackend
+    #   audformat
+    #   audiofile
+    #   audobject
+audformat==1.1.2
+    # via
+    #   -r requirements.txt
+    #   audb
+audiofile==1.4.0
+    # via
+    #   audb
+    #   audformat
+audmath==1.4.0
+    # via audiofile
+audobject==0.7.11
+    # via audb
+audresample==1.3.3
+    # via audb
+certifi==2024.2.2
+    # via requests
+cffi==1.16.0
+    # via soundfile
+charset-normalizer==3.3.2
+    # via requests
+contourpy==1.2.1
+    # via matplotlib
+cycler==0.12.1
+    # via matplotlib
+dohq-artifactory==0.10.0
+    # via audbackend
+filelock==3.13.4
+    # via audb
+fonttools==4.51.0
+    # via matplotlib
+idna==3.7
+    # via requests
+importlib-metadata==7.1.0
+    # via audobject
+iso-639==0.4.5
+    # via audformat
+iso3166==2.1.1
+    # via audformat
+joblib==1.4.0
+    # via scikit-learn
+kiwisolver==1.4.5
+    # via matplotlib
+matplotlib==3.8.4
+    # via
+    #   -r requirements.txt
+    #   seaborn
+numpy==1.26.4
+    # via
+    #   audiofile
+    #   audmath
+    #   audresample
+    #   contourpy
+    #   matplotlib
+    #   pandas
+    #   scikit-learn
+    #   scipy
+    #   seaborn
+oyaml==1.0
+    # via
+    #   audb
+    #   audformat
+    #   audobject
+packaging==24.0
+    # via
+    #   audobject
+    #   matplotlib
+pandas==2.2.2
+    # via
+    #   -r requirements.txt
+    #   audformat
+    #   seaborn
+pillow==10.3.0
+    # via matplotlib
+pycparser==2.22
+    # via cffi
+pyjwt==2.8.0
+    # via dohq-artifactory
+pyparsing==3.1.2
+    # via matplotlib
+python-dateutil==2.9.0.post0
+    # via
+    #   dohq-artifactory
+    #   matplotlib
+    #   pandas
+pytz==2024.1
+    # via pandas
+pyyaml==6.0.1
+    # via
+    #   audformat
+    #   oyaml
+requests==2.31.0
+    # via dohq-artifactory
+scikit-learn==1.4.2
+    # via -r requirements.txt
+scipy==1.13.0
+    # via
+    #   -r requirements.txt
+    #   scikit-learn
+seaborn==0.13.2
+    # via -r requirements.txt
+six==1.16.0
+    # via python-dateutil
+soundfile==0.12.1
+    # via audiofile
+threadpoolctl==3.4.0
+    # via scikit-learn
+tqdm==4.66.2
+    # via audeer
+tzdata==2024.1
+    # via pandas
+urllib3==2.2.1
+    # via requests
+zipp==3.18.1
+    # via importlib-metadata