Skip to content

Commit

Permalink
0.71.0
Browse files Browse the repository at this point in the history
  • Loading branch information
FBurkhardt committed Nov 22, 2023
1 parent b9f0627 commit 7b9bcb5
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 19 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Changelog
=========

Version 0.71.0
--------------
* added explore visuals.
* all columns from databases should now be usable

Version 0.70.0
--------------
* added imb_learn balancing of training set
Expand Down
2 changes: 1 addition & 1 deletion nkululeko/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION="0.70.0"
VERSION="0.71.0"
SAMPLING_RATE = 16000
8 changes: 8 additions & 0 deletions nkululeko/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ def load(self):
df["gender"] = df_target["gender"]
if got_age2:
df["age"] = df_target["age"]
# copy other column
for column in df_target.columns:
if column not in [self.target, "age", "speaker", "gender"]:
df[column] = df_target[column]
except audformat.core.errors.BadKeyError:
pass

Expand Down Expand Up @@ -259,6 +263,10 @@ def _get_df_for_lists(self, db, df_files):
is_labeled = True
except (ValueError, audformat.core.errors.BadKeyError) as e:
pass
# copy other column
for column in source_df.columns:
if column not in [self.target, "age", "speaker", "gender"]:
df_local[column] = source_df[column]
df = pd.concat([df, df_local])
return df, is_labeled, got_speaker, got_gender, got_age

Expand Down
59 changes: 42 additions & 17 deletions nkululeko/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def plot_distributions(self, df, type="samples"):
attributes = ast.literal_eval(
self.util.config_val("EXPL", "value_counts", False)
)
dist_type = self.util.config_val("EXPL", "dist_type", "hist")
dist_type = self.util.config_val("EXPL", "dist_type", "kde")
bin_reals = eval(self.util.config_val("EXPL", "bin_reals", "True"))
for att in attributes:
if len(att) == 1:
Expand All @@ -105,25 +105,50 @@ def plot_distributions(self, df, type="samples"):
self.util.debug(f"plotting {att[0]}")
filename = f"{self.target}-{att[0]}"
if self.util.is_categorical(df[att[0]]):
crosstab = pd.crosstab(index=df["class_label"], columns=df[att[0]])
res_pval = stats.chi2_contingency(crosstab)
res_pval = int(res_pval[1] * 1000) / 1000
caption = f"{type} {df.shape[0]}. P-val chi2: {res_pval}"
ax = (
df.groupby("class_label")[att[0]]
.value_counts()
.unstack()
.plot(kind="bar", stacked=True, title=caption, rot=0)
)
ax.set_ylabel(f"number of {type}")
ax.set_xlabel(self.target)
if self.util.is_categorical(df["class_label"]):
crosstab = pd.crosstab(
index=df["class_label"], columns=df[att[0]]
)
res_pval = stats.chi2_contingency(crosstab)
res_pval = int(res_pval[1] * 1000) / 1000
caption = f"{type} {df.shape[0]}. P-val chi2: {res_pval}"
ax = (
df.groupby("class_label")[att[0]]
.value_counts()
.unstack()
.plot(kind="bar", stacked=True, title=caption, rot=0)
)
ax.set_ylabel(f"number of {type}")
ax.set_xlabel(self.target)
else:
cats, cat_str, es = su.get_effect_size(
df, att[0], "class_label"
)
if dist_type == "hist":
ax = sns.histplot(df, x="class_label", hue=att[0], kde=True)
caption = (
f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
)
ax.set_title(caption)
ax.set_xlabel(f"value of {att[0]}")
ax.set_ylabel(f"number of {type}")
else:
ax = sns.displot(
df, x="class_label", hue=att[0], kind="kde", fill=True
)
caption = (
f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
)
ax.fig.suptitle(caption)
else:
if self.util.is_categorical(df[self.target]) or bin_reals:
cats, es = su.get_effect_size(df, "class_label", att[0])
cats, cat_str, es = su.get_effect_size(
df, "class_label", att[0]
)
if dist_type == "hist":
ax = sns.histplot(df, x=att[0], hue="class_label", kde=True)
caption = (
f"{type} {df.shape[0]}. Effect size ({cats}):" f" {es}"
f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
)
ax.set_title(caption)
ax.set_xlabel(f"value of {att[0]}")
Expand All @@ -137,7 +162,7 @@ def plot_distributions(self, df, type="samples"):
fill=True,
)
caption = (
f"{type} {df.shape[0]}. Effect size ({cats}):" f" {es}"
f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
)
ax.fig.suptitle(caption)
else:
Expand All @@ -148,7 +173,7 @@ def plot_distributions(self, df, type="samples"):
caption = f"{type} {df.shape[0]}. {pearson_string}"
ax.set_title(caption)
fig = ax.figure
plt.tight_layout()
# plt.tight_layout()
img_path = f"{fig_dir}{filename}_{type}.{self.format}"
plt.savefig(img_path)
plt.close(fig)
Expand Down
15 changes: 14 additions & 1 deletion nkululeko/utils/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,17 @@ def get_effect_size(df, target, variable):
other = combo[1]
results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
max_cat = max(results, key=results.get)
return max_cat, results[max_cat]
cat_s = cohens_D_to_string(float(results[max_cat]))
return max_cat, cat_s, results[max_cat]


def cohens_D_to_string(val):
    """Translate a Cohen's d value into a human-readable effect-size label.

    The thresholds follow the usual convention for Cohen's d:
    below 0.2 no effect, below 0.5 small, below 0.8 medium, otherwise large.

    Args:
        val: the effect size as a number. It is compared as-is, so a
            negative value is reported as "no effect".
            NOTE(review): callers presumably pass a non-negative magnitude
            -- confirm against cohen_d.

    Returns:
        A string of the form "Cohen's d: <label>".
    """
    if val < 0.2:
        rval = "no effect"
    elif val < 0.5:
        # previously tested "< 0.2" a second time, which made this branch
        # unreachable and shifted all following labels one step down
        rval = "small effect"
    elif val < 0.8:
        rval = "middle effect"
    else:
        rval = "large effect"
    return f"Cohen's d: {rval}"

0 comments on commit 7b9bcb5

Please sign in to comment.