0.71.1

felixbur · Nov 23, 2023 · 8eecac4 · 8eecac4
1 parent a781c19
commit 8eecac4
Show file tree

Hide file tree

Showing 8 changed files with 154 additions and 139 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,13 @@
 Changelog
 =========
 
+Version 0.71.1
+--------------
+* made wav2vec2-large-robust-ft-swbd-300h the default model for the "wav2vec" feature type
+* renamed praat features, omitting spaces
+* fixed plot distribution bugs
+* added feature plots for continuous targets
+
 Version 0.71.0
 --------------
 * added explore visuals. 

diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.71.0"
+VERSION="0.71.1"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/experiment.py b/nkululeko/experiment.py
@@ -230,17 +230,18 @@ def fill_train_and_tests(self):
         if self.util.exp_is_classification():
             datatype = self.util.config_val("DATA", "type", "dummy")
             if datatype == "continuous":
-                if self.df_test.is_labeled:
-                    # remember the target in case they get labelencoded later
-                    self.df_test["class_label"] = self.df_test[self.target]
-                    test_cats = self.df_test["class_label"].unique()
-                else:
-                    # if there is no target, copy a dummy label
-                    self.df_test = self._add_random_target(self.df_test)
-                if self.df_train.is_labeled:
-                    # remember the target in case they get labelencoded later
-                    self.df_train["class_label"] = self.df_train[self.target]
-                    train_cats = self.df_train["class_label"].unique()
+                # if self.df_test.is_labeled:
+                #     # remember the target in case they get labelencoded later
+                #     self.df_test["class_label"] = self.df_test[self.target]
+                test_cats = self.df_test["class_label"].unique()
+                # else:
+                #     # if there is no target, copy a dummy label
+                #     self.df_test = self._add_random_target(self.df_test)
+                # if self.df_train.is_labeled:
+                #     # remember the target in case they get labelencoded later
+                #     self.df_train["class_label"] = self.df_train[self.target]
+                train_cats = self.df_train["class_label"].unique()
+
             else:
                 if self.df_test.is_labeled:
                     test_cats = self.df_test[self.target].unique()

diff --git a/nkululeko/feat_extract/feats_analyser.py b/nkululeko/feat_extract/feats_analyser.py
@@ -12,10 +12,10 @@
 class FeatureAnalyser:
     def __init__(self, label, df_labels, df_features):
         self.util = Util("feats_analyser")
-        target = self.util.config_val("DATA", "target", "emotion")
-        self.y = df_labels[target]
+        self.target = self.util.config_val("DATA", "target", "emotion")
+        self.labels = df_labels[self.target]
         self.df_labels = df_labels
-        self.X = df_features
+        self.features = df_features
         self.label = label
 
     def analyse(self):
@@ -26,44 +26,40 @@ def analyse(self):
         if self.util.exp_is_classification():
             if model_s == "log_reg":
                 model = LogisticRegression()
-                model.fit(self.X, self.y)
+                model.fit(self.features, self.labels)
                 importance = model.coef_[0]
             elif model_s == "tree":
                 model = DecisionTreeClassifier()
-                model.fit(self.X, self.y)
+                model.fit(self.features, self.labels)
                 importance = model.feature_importances_
-                plot_tree = eval(
-                    self.util.config_val("EXPL", "plot_tree", "False")
-                )
+                plot_tree = eval(self.util.config_val("EXPL", "plot_tree", "False"))
                 if plot_tree:
                     plots = Plots()
-                    plots.plot_tree(model, self.X)
+                    plots.plot_tree(model, self.features)
             else:
                 self.util.error(f"invalid analysis method: {model}")
         else:  # regression experiment
             if model_s == "lin_reg":
                 model = LinearRegression()
-                model.fit(self.X, self.y)
+                model.fit(self.features, self.labels)
                 importance = model.coef_
             elif model_s == "tree":
                 model = DecisionTreeRegressor()
-                model.fit(self.X, self.y)
+                model.fit(self.features, self.labels)
                 importance = model.feature_importances_
             else:
                 self.util.error(f"invalid analysis method: {model_s}")
 
         df_imp = pd.DataFrame(
-            {"feats": self.X.columns, "importance": importance}
+            {"feats": self.features.columns, "importance": importance}
         )
         df_imp = df_imp.sort_values(by="importance", ascending=False).iloc[
             :max_feat_num
         ]
         ax = df_imp.plot(x="feats", y="importance", kind="bar")
         ax.set(title=f"{self.label} samples")
         plt.tight_layout()
-        fig_dir = (
-            self.util.get_path("fig_dir") + "../"
-        )  # one up because of the runs
+        fig_dir = self.util.get_path("fig_dir") + "../"  # one up because of the runs
         exp_name = self.util.get_exp_name(only_data=True)
         format = self.util.config_val("PLOT", "format", "png")
         filename = f"{fig_dir}{exp_name}EXPL_{model_s}.{format}"
@@ -73,36 +69,28 @@ def analyse(self):
         plt.close(fig)
         # result file
         res_dir = self.util.get_path("res_dir")
-        file_name = f"{res_dir}{self.util.get_exp_name(only_data=True)}EXPL_{model_s}.txt"
+        file_name = (
+            f"{res_dir}{self.util.get_exp_name(only_data=True)}EXPL_{model_s}.txt"
+        )
         with open(file_name, "w") as text_file:
             text_file.write(
                 "features in order of decreasing importance according to model"
-                f" {model_s}:\n"
-                + f"{str(df_imp.feats.values)}\n"
+                f" {model_s}:\n" + f"{str(df_imp.feats.values)}\n"
             )
 
         df_imp.to_csv(file_name, mode="a")
 
         # check if feature distributions should be plotted
-        plot_feats = self.util.config_val(
-            "EXPL", "feature_distributions", False
-        )
+        plot_feats = self.util.config_val("EXPL", "feature_distributions", False)
         if plot_feats:
-            sample_selection = self.util.config_val(
-                "EXPL", "sample_selection", "all"
-            )
-            if self.util.exp_is_classification():
-                for feature in df_imp.feats:
-                    # plot_feature(self, title, feature, label, df_labels, df_features):
-                    _plots = Plots()
-                    _plots.plot_feature(
-                        sample_selection,
-                        feature,
-                        "class_label",
-                        self.df_labels,
-                        self.X,
-                    )
-            else:
-                self.util.debug(
-                    "can't plot feature distributions if not classification"
+            sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
+            for feature in df_imp.feats:
+                # plot_feature(self, title, feature, label, df_labels, df_features):
+                _plots = Plots()
+                _plots.plot_feature(
+                    sample_selection,
+                    feature,
+                    self.target,
+                    self.df_labels,
+                    self.features,
                 )
diff --git a/nkululeko/feat_extract/feats_wav2vec2.py b/nkululeko/feat_extract/feats_wav2vec2.py
@@ -19,7 +19,10 @@ def __init__(self, name, data_df, feat_type):
         cuda = "cuda" if torch.cuda.is_available() else "cpu"
         self.device = self.util.config_val("MODEL", "device", cuda)
         self.model_initialized = False
-        self.feat_type = feat_type
+        if feat_type == "wav2vec":
+            self.feat_type = "wav2vec2-large-robust-ft-swbd-300h"
+        else:
+            self.feat_type = feat_type
 
     def init_model(self):
         # load model
@@ -37,9 +40,7 @@ def extract(self):
         """Extract the features or load them from disk if present."""
         store = self.util.get_path("store")
         storage = f"{store}{self.name}.pkl"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False
-        )
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
             if not self.model_initialized:
@@ -57,15 +58,11 @@ def extract(self):
                     frame_offset=int(start.total_seconds() * 16000),
                     num_frames=int((end - start).total_seconds() * 16000),
                 )
-                assert (
-                    sampling_rate == 16000
-                ), f"got {sampling_rate} instead of 16000"
+                assert sampling_rate == 16000, f"got {sampling_rate} instead of 16000"
                 emb = self.get_embeddings(signal, sampling_rate, file)
                 emb_series[idx] = emb
             # print(f"emb_series shape: {emb_series.shape}")
-            self.df = pd.DataFrame(
-                emb_series.values.tolist(), index=self.data_df.index
-            )
+            self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
             # print(f"df shape: {self.df.shape}")
             self.df.to_pickle(storage)
             try:

diff --git a/nkululeko/feat_extract/feinberg_praat.py b/nkululeko/feat_extract/feinberg_praat.py
@@ -466,11 +466,11 @@ def get_speech_rate(file_index):
     cols = [
         "nsyll",
         "npause",
-        "dur(s)",
-        "phonationtime(s)",
-        "speechrate(nsyll / dur)",
-        "articulation rate(nsyll / phonationtime)",
-        "ASD(speakingtime / nsyll)",
+        "dur_s",
+        "phonationtime_s",
+        "speechrate_nsyll_dur",
+        "articulation_rate_nsyll_phonationtime",
+        "ASD_speakingtime_nsyll",
     ]
     datalist = []
     for idx, (wave_file, start, end) in enumerate(tqdm(file_index.to_list())):
@@ -621,10 +621,10 @@ def speech_rate(sound):
     speechrate_dictionary = {
         "nsyll": voicedcount,
         "npause": npause,
-        "dur(s)": originaldur,
-        "phonationtime(s)": intensity_duration,
-        "speechrate(nsyll / dur)": speakingrate,
-        "articulation rate(nsyll / phonationtime)": articulationrate,
-        "ASD(speakingtime / nsyll)": asd,
+        "dur_s": originaldur,
+        "phonationtime_s": intensity_duration,
+        "speechrate_nsyll_dur": speakingrate,
+        "articulation_rate_nsyll_phonationtime": articulationrate,
+        "ASD_speakingtime_nsyll": asd,
     }
     return speechrate_dictionary
diff --git a/nkululeko/feature_extractor.py b/nkululeko/feature_extractor.py
@@ -58,7 +58,7 @@ def extract(self):
                 self.featExtractor = TRILLset(
                     f"{store_name}_{self.feats_designation}", self.data_df
                 )
-            elif feats_type.startswith("wav2vec2"):
+            elif feats_type.startswith("wav2vec"):
                 from nkululeko.feat_extract.feats_wav2vec2 import Wav2vec2
 
                 self.featExtractor = Wav2vec2(