Skip to content
This repository has been archived by the owner on Dec 9, 2024. It is now read-only.

Commit

Permalink
Merge pull request #2 from audeering/add_age_tables
Browse files Browse the repository at this point in the history
Add age tables
  • Loading branch information
felixbur authored Apr 17, 2024
2 parents 0c05335 + 4419d70 commit 92753e7
Show file tree
Hide file tree
Showing 9 changed files with 959 additions and 0 deletions.
66 changes: 66 additions & 0 deletions 1.3.0/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
This creates new age and gender train, dev and test sets from the database,
for neutral and emotional samples.

The following files are included:
* *create.py* generate a new database with splits
* *publish.py* publish the new database with audb
* *split_utils.py* utilities to stratify splits
* *trainDevTestSplit.py* helper functions for split_utils
* *util.py* general helper functions
* *requirements.txt* collection of packages that are needed

To generate the new database, you
* set up a new environment
* install the packages
* call python create.py
* inspect the result folder
* call python publish.py


The names of the new splits are

```
age.dev:
type: filewise
description: Table selected for age and binary gender balance from the emotionally
neutral samples, limited to 20 samples per speaker.
columns:
speaker: {scheme_id: speaker}
age.emotional.dev:
type: filewise
description: Table selected for age and binary gender balance from all samples,
limited to 20 samples per speaker.
columns:
speaker: {scheme_id: speaker}
age.emotional.test:
type: filewise
description: Table selected for age and binary gender balance from all samples,
limited to 20 samples per speaker.
columns:
speaker: {scheme_id: speaker}
age.emotional.train:
type: filewise
description: Table selected for age and binary gender balance from all samples,
limited to 20 samples per speaker.
columns:
speaker: {scheme_id: speaker}
age.test:
type: filewise
description: Table selected for age and binary gender balance from the emotionally
neutral samples, limited to 20 samples per speaker.
columns:
speaker: {scheme_id: speaker}
age.train:
type: filewise
description: Table selected for age and binary gender balance from the emotionally
neutral samples, limited to 20 samples per speaker.
columns:
speaker: {scheme_id: speaker}
```
To access the age and gender of the samples in the train split that was selected from all (emotional) samples, you could do:
```
df = db["age.emotional.train"].get()
df["gender"] = db["files"]["speaker"].get(map="sex")
df["age"] = db["files"]["speaker"].get(map="age").astype("int")
```

116 changes: 116 additions & 0 deletions 1.3.0/create.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""
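Add age.[train|dev|test] and age.emotional.[train|dev|test] tables with speaker information.
The age.* tables consist of randomly selected 20 emotionally neutral samples per speaker,
the age.emotional.* tables of 20 samples per speaker taken from all emotions,
all tables being age/gender balanced.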
"""

import random
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import audb
import audeer
import audformat
import util
import trainDevTestSplit

# Seed Python's built-in random module so repeated runs are reproducible.
# NOTE(review): this only seeds the stdlib `random` module; whether the helper
# modules (util, trainDevTestSplit) draw from it rather than from another
# random generator is not visible here - confirm.
random.seed(23)

# Directory name for images. It is not referenced by the code visible in this
# script (the plots are written to "results") - presumably used elsewhere; verify.
image_dir = "images/"


def _add_speaker_table(db, table_id, frame, description):
    """Add a table with a single ``speaker`` column to the database.

    Args:
        db: database the new table is stored in under ``table_id``
        table_id: name of the new table
        frame: dataframe whose index becomes the table index
            and whose ``speaker`` column provides the column values
        description: description text stored with the table

    """
    db[table_id] = audformat.Table(frame.index, description=description)
    db[table_id]["speaker"] = audformat.Column(scheme_id="speaker")
    db[table_id]["speaker"].set(frame["speaker"])


def main():
    """Create the age tables for crema-d and store them in the build folder.

    Loads the metadata of the previous database version into ``./build``,
    selects samples (all emotions and emotionally neutral only,
    each limited per speaker by ``util.limit_speakers``),
    splits both selections into train, dev and test
    with ``trainDevTestSplit.split_df``,
    adds the resulting ``age.*`` and ``age.emotional.*`` tables,
    saves the database
    and writes one distribution plot per neutral split to ``results``.

    Raises:
        ValueError: if a selected file has no entry
            in the emotion category tables

    """
    name = "crema-d"
    previous_version = "1.2.0"

    build_dir = audeer.mkdir("./build")

    audb.load_to(
        build_dir,
        name,
        version=previous_version,
        num_workers=8,
        only_metadata=True,
        verbose=True,
    )
    db = audformat.Database.load(build_dir)

    # get age, gender and emotion info
    df = db["files"].get()
    df_emo = pd.concat(
        [
            db[f"emotion.categories.{split}"].get()
            for split in ("train", "dev", "test")
        ]
    )
    df["age"] = db["files"]["speaker"].get(map="age").astype("int")
    df["gender"] = db["files"]["speaker"].get(map="sex")
    # df["duration"] = df.index.to_series().map(lambda x: audiofile.duration(x))

    # Drop speakers with gender "other" and files flagged as corrupted.
    # Rows without a corrupted flag are kept, as with `!= True`.
    keep = df["gender"].ne("other") & df["corrupted"].ne(True)
    # Copy, so adding the emotion column below does not write to a slice.
    df = df[keep].copy()

    # make a dataframe for 20 samples per speaker
    # (created before the emotion column is added, as before)
    df_lim = util.limit_speakers(df)

    # make a dataframe for emotionally neutral samples
    # The emotion tables are concatenated split by split and may contain
    # rows that were filtered out above, so the labels have to be matched
    # by file index. A positional assignment would mislabel files whenever
    # the order differs and fail whenever the lengths differ.
    # NOTE(review): assumes the emotion tables share the index of the
    # "files" table - confirm.
    emotion = df_emo["emotion.0"]
    missing = df.index.difference(emotion.index)
    if not missing.empty:
        raise ValueError(
            f"{len(missing)} files have no entry in the emotion tables, "
            f"e.g. {missing[0]}"
        )
    df["emotion"] = emotion.reindex(df.index)
    df_neut = df[df["emotion"].isin(["neutral"])]
    # make a dataframe for emotionally neutral samples, limited to 20 samples
    df_neut_lim = util.limit_speakers(df_neut)

    # create split sets for samples from all emotions
    df_train, df_dev, df_test = trainDevTestSplit.split_df(df_lim)
    splits_emo = {"train": df_train, "dev": df_dev, "test": df_test}
    # create split sets for neutral samples
    df_train, df_dev, df_test = trainDevTestSplit.split_df(df_neut_lim)
    splits_neut = {"train": df_train, "dev": df_dev, "test": df_test}

    # fill the database with new tables
    age_tables_name = "age."
    age_tables_emotional_name = "age.emotional."
    neutral_description = (
        "Table selected for age and binary gender balance "
        "from the emotionally neutral samples, "
        "limited to 20 samples per speaker."
    )
    emotional_description = (
        "Table selected for age and binary gender balance "
        "from all samples, "
        "limited to 20 samples per speaker."
    )
    for split in splits_emo:
        _add_speaker_table(
            db,
            f"{age_tables_name}{split}",
            splits_neut[split],
            neutral_description,
        )
        _add_speaker_table(
            db,
            f"{age_tables_emotional_name}{split}",
            splits_emo[split],
            emotional_description,
        )

    db.save(build_dir)
    print(db)

    # Sanity check: read the neutral tables back from the database,
    # report their size and plot their distribution.
    print("testing:")
    res_dir = audeer.mkdir("results")
    # Speaker information does not depend on the split, so look it up once;
    # the assignments below align it to each table by file index.
    gender = db["files"]["speaker"].get(map="sex")
    age = db["files"]["speaker"].get(map="age").astype("int")
    for split in splits_neut:
        df = db[f"{age_tables_name}{split}"].get()
        df["gender"] = gender
        df["age"] = age
        sn = df["speaker"].nunique()
        print(f"new {split}: {df.shape[0]}, {sn}")

        util.distribution(df, split)
        plt.tight_layout()
        plt.savefig(f"{res_dir}/{split}.png")
        plt.close()


if __name__ == "__main__":
    main()
20 changes: 20 additions & 0 deletions 1.3.0/publish.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import audb

# Folder holding the database built by create.py,
# the version to publish and the version it is based on.
build_dir = "./build"
version = "1.3.0"
previous_version = "1.2.0"

# Target repository the new version is published to.
repository = audb.Repository(
    backend="artifactory",
    host="https://audeering.jfrog.io/artifactory",
    name="data-public",
)

# Keyword arguments for the publish call, collected in one place.
publish_options = {
    "version": version,
    "previous_version": previous_version,
    "repository": repository,
    "num_workers": 1,
    "verbose": True,
}

audb.publish(build_dir, **publish_options)
9 changes: 9 additions & 0 deletions 1.3.0/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
pandas
matplotlib
seaborn
scipy
scikit-learn
audb
audeer
audformat

134 changes: 134 additions & 0 deletions 1.3.0/requirements.txt.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --output-file=requirements.txt.lock requirements.txt
#
--index-url https://artifactory.audeering.com/artifactory/api/pypi/pypi/simple

audb==1.6.5
# via -r requirements.txt
audbackend[artifactory]==1.0.2
# via audb
audeer==2.0.0
# via
# -r requirements.txt
# audb
# audbackend
# audformat
# audiofile
# audobject
audformat==1.1.2
# via
# -r requirements.txt
# audb
audiofile==1.4.0
# via
# audb
# audformat
audmath==1.4.0
# via audiofile
audobject==0.7.11
# via audb
audresample==1.3.3
# via audb
certifi==2024.2.2
# via requests
cffi==1.16.0
# via soundfile
charset-normalizer==3.3.2
# via requests
contourpy==1.2.1
# via matplotlib
cycler==0.12.1
# via matplotlib
dohq-artifactory==0.10.0
# via audbackend
filelock==3.13.4
# via audb
fonttools==4.51.0
# via matplotlib
idna==3.7
# via requests
importlib-metadata==7.1.0
# via audobject
iso-639==0.4.5
# via audformat
iso3166==2.1.1
# via audformat
joblib==1.4.0
# via scikit-learn
kiwisolver==1.4.5
# via matplotlib
matplotlib==3.8.4
# via
# -r requirements.txt
# seaborn
numpy==1.26.4
# via
# audiofile
# audmath
# audresample
# contourpy
# matplotlib
# pandas
# scikit-learn
# scipy
# seaborn
oyaml==1.0
# via
# audb
# audformat
# audobject
packaging==24.0
# via
# audobject
# matplotlib
pandas==2.2.2
# via
# -r requirements.txt
# audformat
# seaborn
pillow==10.3.0
# via matplotlib
pycparser==2.22
# via cffi
pyjwt==2.8.0
# via dohq-artifactory
pyparsing==3.1.2
# via matplotlib
python-dateutil==2.9.0.post0
# via
# dohq-artifactory
# matplotlib
# pandas
pytz==2024.1
# via pandas
pyyaml==6.0.1
# via
# audformat
# oyaml
requests==2.31.0
# via dohq-artifactory
scikit-learn==1.4.2
# via -r requirements.txt
scipy==1.13.0
# via
# -r requirements.txt
# scikit-learn
seaborn==0.13.2
# via -r requirements.txt
six==1.16.0
# via python-dateutil
soundfile==0.12.1
# via audiofile
threadpoolctl==3.4.0
# via scikit-learn
tqdm==4.66.2
# via audeer
tzdata==2024.1
# via pandas
urllib3==2.2.1
# via requests
zipp==3.18.1
# via importlib-metadata
Loading

0 comments on commit 92753e7

Please sign in to comment.