0.71.0

felixbur · Nov 22, 2023 · 7b9bcb5 · 7b9bcb5
1 parent b9f0627
commit 7b9bcb5
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 19 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,11 @@
 Changelog
 =========
 
+Version 0.71.0
+--------------
+* added explore visuals
+* all columns from databases should now be usable
+
 Version 0.70.0
 --------------
 * added imb_learn balancing of training set

diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.70.0"
+VERSION="0.71.0"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/data/dataset.py b/nkululeko/data/dataset.py
@@ -157,6 +157,10 @@ def load(self):
                     df["gender"] = df_target["gender"]
                 if got_age2:
                     df["age"] = df_target["age"]
+                # copy other column
+                for column in df_target.columns:
+                    if column not in [self.target, "age", "speaker", "gender"]:
+                        df[column] = df_target[column]
             except audformat.core.errors.BadKeyError:
                 pass
 
@@ -259,6 +263,10 @@ def _get_df_for_lists(self, db, df_files):
                 is_labeled = True
             except (ValueError, audformat.core.errors.BadKeyError) as e:
                 pass
+            # copy other column
+            for column in source_df.columns:
+                if column not in [self.target, "age", "speaker", "gender"]:
+                    df_local[column] = source_df[column]
             df = pd.concat([df, df_local])
         return df, is_labeled, got_speaker, got_gender, got_age
 

diff --git a/nkululeko/plots.py b/nkululeko/plots.py
@@ -95,7 +95,7 @@ def plot_distributions(self, df, type="samples"):
         attributes = ast.literal_eval(
             self.util.config_val("EXPL", "value_counts", False)
         )
-        dist_type = self.util.config_val("EXPL", "dist_type", "hist")
+        dist_type = self.util.config_val("EXPL", "dist_type", "kde")
         bin_reals = eval(self.util.config_val("EXPL", "bin_reals", "True"))
         for att in attributes:
             if len(att) == 1:
@@ -105,25 +105,50 @@ def plot_distributions(self, df, type="samples"):
                 self.util.debug(f"plotting {att[0]}")
                 filename = f"{self.target}-{att[0]}"
                 if self.util.is_categorical(df[att[0]]):
-                    crosstab = pd.crosstab(index=df["class_label"], columns=df[att[0]])
-                    res_pval = stats.chi2_contingency(crosstab)
-                    res_pval = int(res_pval[1] * 1000) / 1000
-                    caption = f"{type} {df.shape[0]}. P-val chi2: {res_pval}"
-                    ax = (
-                        df.groupby("class_label")[att[0]]
-                        .value_counts()
-                        .unstack()
-                        .plot(kind="bar", stacked=True, title=caption, rot=0)
-                    )
-                    ax.set_ylabel(f"number of {type}")
-                    ax.set_xlabel(self.target)
+                    if self.util.is_categorical(df["class_label"]):
+                        crosstab = pd.crosstab(
+                            index=df["class_label"], columns=df[att[0]]
+                        )
+                        res_pval = stats.chi2_contingency(crosstab)
+                        res_pval = int(res_pval[1] * 1000) / 1000
+                        caption = f"{type} {df.shape[0]}. P-val chi2: {res_pval}"
+                        ax = (
+                            df.groupby("class_label")[att[0]]
+                            .value_counts()
+                            .unstack()
+                            .plot(kind="bar", stacked=True, title=caption, rot=0)
+                        )
+                        ax.set_ylabel(f"number of {type}")
+                        ax.set_xlabel(self.target)
+                    else:
+                        cats, cat_str, es = su.get_effect_size(
+                            df, att[0], "class_label"
+                        )
+                        if dist_type == "hist":
+                            ax = sns.histplot(df, x="class_label", hue=att[0], kde=True)
+                            caption = (
+                                f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
+                            )
+                            ax.set_title(caption)
+                            ax.set_xlabel(f"value of {att[0]}")
+                            ax.set_ylabel(f"number of {type}")
+                        else:
+                            ax = sns.displot(
+                                df, x="class_label", hue=att[0], kind="kde", fill=True
+                            )
+                            caption = (
+                                f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
+                            )
+                            ax.fig.suptitle(caption)
                 else:
                     if self.util.is_categorical(df[self.target]) or bin_reals:
-                        cats, es = su.get_effect_size(df, "class_label", att[0])
+                        cats, cat_str, es = su.get_effect_size(
+                            df, "class_label", att[0]
+                        )
                         if dist_type == "hist":
                             ax = sns.histplot(df, x=att[0], hue="class_label", kde=True)
                             caption = (
-                                f"{type} {df.shape[0]}. Effect size ({cats}):" f" {es}"
+                                f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
                             )
                             ax.set_title(caption)
                             ax.set_xlabel(f"value of {att[0]}")
@@ -137,7 +162,7 @@ def plot_distributions(self, df, type="samples"):
                                 fill=True,
                             )
                             caption = (
-                                f"{type} {df.shape[0]}. Effect size ({cats}):" f" {es}"
+                                f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
                             )
                             ax.fig.suptitle(caption)
                     else:
@@ -148,7 +173,7 @@ def plot_distributions(self, df, type="samples"):
                         caption = f"{type} {df.shape[0]}. {pearson_string}"
                         ax.set_title(caption)
                 fig = ax.figure
-                plt.tight_layout()
+                # plt.tight_layout()
                 img_path = f"{fig_dir}{filename}_{type}.{self.format}"
                 plt.savefig(img_path)
                 plt.close(fig)

diff --git a/nkululeko/utils/stats.py b/nkululeko/utils/stats.py
@@ -71,4 +71,17 @@ def get_effect_size(df, target, variable):
         other = combo[1]
         results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
     max_cat = max(results, key=results.get)
-    return max_cat, results[max_cat]
+    cat_s = cohens_D_to_string(float(results[max_cat]))
+    return max_cat, cat_s, results[max_cat]
+
+
+def cohens_D_to_string(val):
+    """Label effect size val by Cohen's bounds: 0.2 small, 0.5 middle, 0.8 large."""
+    size = abs(val)  # the sign only gives direction; strength is the magnitude
+    if size < 0.2:
+        return "Cohen's d: no effect"
+    if size < 0.5:  # 0.2 <= size < 0.5
+        return "Cohen's d: small effect"
+    if size < 0.8:  # 0.5 <= size < 0.8
+        return "Cohen's d: middle effect"
+    return "Cohen's d: large effect"