diff --git a/CHANGELOG.md b/CHANGELOG.md index 47a5b2b6..1f87d514 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ Changelog ========= +Version 0.71.1 +-------------- +* made a wav2vec default +* renamed praat features, omitting spaces +* fixed plot distribution bugs +* added feature plots for continuous targets + Version 0.71.0 -------------- * added explore visuals. diff --git a/nkululeko/constants.py b/nkululeko/constants.py index b8e9aa83..884af734 100644 --- a/nkululeko/constants.py +++ b/nkululeko/constants.py @@ -1,2 +1,2 @@ -VERSION="0.71.0" +VERSION="0.71.1" SAMPLING_RATE = 16000 diff --git a/nkululeko/experiment.py b/nkululeko/experiment.py index 660f7c62..25aace84 100644 --- a/nkululeko/experiment.py +++ b/nkululeko/experiment.py @@ -230,17 +230,18 @@ def fill_train_and_tests(self): if self.util.exp_is_classification(): datatype = self.util.config_val("DATA", "type", "dummy") if datatype == "continuous": - if self.df_test.is_labeled: - # remember the target in case they get labelencoded later - self.df_test["class_label"] = self.df_test[self.target] - test_cats = self.df_test["class_label"].unique() - else: - # if there is no target, copy a dummy label - self.df_test = self._add_random_target(self.df_test) - if self.df_train.is_labeled: - # remember the target in case they get labelencoded later - self.df_train["class_label"] = self.df_train[self.target] - train_cats = self.df_train["class_label"].unique() + # if self.df_test.is_labeled: + # # remember the target in case they get labelencoded later + # self.df_test["class_label"] = self.df_test[self.target] + test_cats = self.df_test["class_label"].unique() + # else: + # # if there is no target, copy a dummy label + # self.df_test = self._add_random_target(self.df_test) + # if self.df_train.is_labeled: + # # remember the target in case they get labelencoded later + # self.df_train["class_label"] = self.df_train[self.target] + train_cats = self.df_train["class_label"].unique() + else: if 
self.df_test.is_labeled: test_cats = self.df_test[self.target].unique() diff --git a/nkululeko/feat_extract/feats_analyser.py b/nkululeko/feat_extract/feats_analyser.py index d4c34eba..12840fd1 100644 --- a/nkululeko/feat_extract/feats_analyser.py +++ b/nkululeko/feat_extract/feats_analyser.py @@ -12,10 +12,10 @@ class FeatureAnalyser: def __init__(self, label, df_labels, df_features): self.util = Util("feats_analyser") - target = self.util.config_val("DATA", "target", "emotion") - self.y = df_labels[target] + self.target = self.util.config_val("DATA", "target", "emotion") + self.labels = df_labels[self.target] self.df_labels = df_labels - self.X = df_features + self.features = df_features self.label = label def analyse(self): @@ -26,34 +26,32 @@ def analyse(self): if self.util.exp_is_classification(): if model_s == "log_reg": model = LogisticRegression() - model.fit(self.X, self.y) + model.fit(self.features, self.labels) importance = model.coef_[0] elif model_s == "tree": model = DecisionTreeClassifier() - model.fit(self.X, self.y) + model.fit(self.features, self.labels) importance = model.feature_importances_ - plot_tree = eval( - self.util.config_val("EXPL", "plot_tree", "False") - ) + plot_tree = eval(self.util.config_val("EXPL", "plot_tree", "False")) if plot_tree: plots = Plots() - plots.plot_tree(model, self.X) + plots.plot_tree(model, self.features) else: self.util.error(f"invalid analysis method: {model}") else: # regression experiment if model_s == "lin_reg": model = LinearRegression() - model.fit(self.X, self.y) + model.fit(self.features, self.labels) importance = model.coef_ elif model_s == "tree": model = DecisionTreeRegressor() - model.fit(self.X, self.y) + model.fit(self.features, self.labels) importance = model.feature_importances_ else: self.util.error(f"invalid analysis method: {model_s}") df_imp = pd.DataFrame( - {"feats": self.X.columns, "importance": importance} + {"feats": self.features.columns, "importance": importance} ) df_imp = 
df_imp.sort_values(by="importance", ascending=False).iloc[ :max_feat_num @@ -61,9 +59,7 @@ def analyse(self): ax = df_imp.plot(x="feats", y="importance", kind="bar") ax.set(title=f"{self.label} samples") plt.tight_layout() - fig_dir = ( - self.util.get_path("fig_dir") + "../" - ) # one up because of the runs + fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs exp_name = self.util.get_exp_name(only_data=True) format = self.util.config_val("PLOT", "format", "png") filename = f"{fig_dir}{exp_name}EXPL_{model_s}.{format}" @@ -73,36 +69,28 @@ def analyse(self): plt.close(fig) # result file res_dir = self.util.get_path("res_dir") - file_name = f"{res_dir}{self.util.get_exp_name(only_data=True)}EXPL_{model_s}.txt" + file_name = ( + f"{res_dir}{self.util.get_exp_name(only_data=True)}EXPL_{model_s}.txt" + ) with open(file_name, "w") as text_file: text_file.write( "features in order of decreasing importance according to model" - f" {model_s}:\n" - + f"{str(df_imp.feats.values)}\n" + f" {model_s}:\n" + f"{str(df_imp.feats.values)}\n" ) df_imp.to_csv(file_name, mode="a") # check if feature distributions should be plotted - plot_feats = self.util.config_val( - "EXPL", "feature_distributions", False - ) + plot_feats = self.util.config_val("EXPL", "feature_distributions", False) if plot_feats: - sample_selection = self.util.config_val( - "EXPL", "sample_selection", "all" - ) - if self.util.exp_is_classification(): - for feature in df_imp.feats: - # plot_feature(self, title, feature, label, df_labels, df_features): - _plots = Plots() - _plots.plot_feature( - sample_selection, - feature, - "class_label", - self.df_labels, - self.X, - ) - else: - self.util.debug( - "can't plot feature distributions if not classification" + sample_selection = self.util.config_val("EXPL", "sample_selection", "all") + for feature in df_imp.feats: + # plot_feature(self, title, feature, label, df_labels, df_features): + _plots = Plots() + _plots.plot_feature( + 
sample_selection, + feature, + self.target, + self.df_labels, + self.features, ) diff --git a/nkululeko/feat_extract/feats_wav2vec2.py b/nkululeko/feat_extract/feats_wav2vec2.py index 0c6fea7a..7875325e 100644 --- a/nkululeko/feat_extract/feats_wav2vec2.py +++ b/nkululeko/feat_extract/feats_wav2vec2.py @@ -19,7 +19,10 @@ def __init__(self, name, data_df, feat_type): cuda = "cuda" if torch.cuda.is_available() else "cpu" self.device = self.util.config_val("MODEL", "device", cuda) self.model_initialized = False - self.feat_type = feat_type + if feat_type == "wav2vec": + self.feat_type = "wav2vec2-large-robust-ft-swbd-300h" + else: + self.feat_type = feat_type def init_model(self): # load model @@ -37,9 +40,7 @@ def extract(self): """Extract the features or load them from disk if present.""" store = self.util.get_path("store") storage = f"{store}{self.name}.pkl" - extract = self.util.config_val( - "FEATS", "needs_feature_extraction", False - ) + extract = self.util.config_val("FEATS", "needs_feature_extraction", False) no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False")) if extract or no_reuse or not os.path.isfile(storage): if not self.model_initialized: @@ -57,15 +58,11 @@ def extract(self): frame_offset=int(start.total_seconds() * 16000), num_frames=int((end - start).total_seconds() * 16000), ) - assert ( - sampling_rate == 16000 - ), f"got {sampling_rate} instead of 16000" + assert sampling_rate == 16000, f"got {sampling_rate} instead of 16000" emb = self.get_embeddings(signal, sampling_rate, file) emb_series[idx] = emb # print(f"emb_series shape: {emb_series.shape}") - self.df = pd.DataFrame( - emb_series.values.tolist(), index=self.data_df.index - ) + self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index) # print(f"df shape: {self.df.shape}") self.df.to_pickle(storage) try: diff --git a/nkululeko/feat_extract/feinberg_praat.py b/nkululeko/feat_extract/feinberg_praat.py index 74a21793..20ee3528 100644 --- 
a/nkululeko/feat_extract/feinberg_praat.py +++ b/nkululeko/feat_extract/feinberg_praat.py @@ -466,11 +466,11 @@ def get_speech_rate(file_index): cols = [ "nsyll", "npause", - "dur(s)", - "phonationtime(s)", - "speechrate(nsyll / dur)", - "articulation rate(nsyll / phonationtime)", - "ASD(speakingtime / nsyll)", + "dur_s", + "phonationtime_s", + "speechrate_nsyll_dur", + "articulation_rate_nsyll_phonationtime", + "ASD_speakingtime_nsyll", ] datalist = [] for idx, (wave_file, start, end) in enumerate(tqdm(file_index.to_list())): @@ -621,10 +621,10 @@ def speech_rate(sound): speechrate_dictionary = { "nsyll": voicedcount, "npause": npause, - "dur(s)": originaldur, - "phonationtime(s)": intensity_duration, - "speechrate(nsyll / dur)": speakingrate, - "articulation rate(nsyll / phonationtime)": articulationrate, - "ASD(speakingtime / nsyll)": asd, + "dur_s": originaldur, + "phonationtime_s": intensity_duration, + "speechrate_nsyll_dur": speakingrate, + "articulation_rate_nsyll_phonationtime": articulationrate, + "ASD_speakingtime_nsyll": asd, } return speechrate_dictionary diff --git a/nkululeko/feature_extractor.py b/nkululeko/feature_extractor.py index 9c633403..8a404a78 100644 --- a/nkululeko/feature_extractor.py +++ b/nkululeko/feature_extractor.py @@ -58,7 +58,7 @@ def extract(self): self.featExtractor = TRILLset( f"{store_name}_{self.feats_designation}", self.data_df ) - elif feats_type.startswith("wav2vec2"): + elif feats_type.startswith("wav2vec"): from nkululeko.feat_extract.feats_wav2vec2 import Wav2vec2 self.featExtractor = Wav2vec2( diff --git a/nkululeko/plots.py b/nkululeko/plots.py index 7822f763..2ed24ac0 100644 --- a/nkululeko/plots.py +++ b/nkululeko/plots.py @@ -88,93 +88,57 @@ def plot_distributions_speaker(self, df): img_path, ) ) - self.plot_distributions(df_speakers, type="speakers") + self.plot_distributions(df_speakers, type_s="speakers") - def plot_distributions(self, df, type="samples"): + def plot_distributions(self, df, type_s="samples"): 
fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs attributes = ast.literal_eval( self.util.config_val("EXPL", "value_counts", False) ) - dist_type = self.util.config_val("EXPL", "dist_type", "kde") - bin_reals = eval(self.util.config_val("EXPL", "bin_reals", "True")) + bin_reals = eval(self.util.config_val("EXPL", "bin_reals", "False")) for att in attributes: if len(att) == 1: - caption, title = "", "" if att[0] not in df: self.util.error(f"unknown feature: {att[0]}") self.util.debug(f"plotting {att[0]}") filename = f"{self.target}-{att[0]}" if self.util.is_categorical(df[att[0]]): if self.util.is_categorical(df["class_label"]): - crosstab = pd.crosstab( - index=df["class_label"], columns=df[att[0]] - ) - res_pval = stats.chi2_contingency(crosstab) - res_pval = int(res_pval[1] * 1000) / 1000 - caption = f"{type} {df.shape[0]}. P-val chi2: {res_pval}" - ax = ( - df.groupby("class_label")[att[0]] - .value_counts() - .unstack() - .plot(kind="bar", stacked=True, title=caption, rot=0) + ax, caption = self._plot2cat( + df, "class_label", att[0], self.target, type_s ) - ax.set_ylabel(f"number of {type}") - ax.set_xlabel(self.target) else: - cats, cat_str, es = su.get_effect_size( - df, att[0], "class_label" + ax, caption = self._plotcatcont( + df, "class_label", att[0], self.target, type_s ) - if dist_type == "hist": - ax = sns.histplot(df, x="class_label", hue=att[0], kde=True) - caption = ( - f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}" - ) - ax.set_title(caption) - ax.set_xlabel(f"value of {att[0]}") - ax.set_ylabel(f"number of {type}") - else: - ax = sns.displot( - df, x="class_label", hue=att[0], kind="kde", fill=True - ) - caption = ( - f"{type} {df.shape[0]}. 
{cat_str} ({cats}):" f" {es}" - ) - ax.fig.suptitle(caption) else: - if self.util.is_categorical(df[self.target]) or bin_reals: - cats, cat_str, es = su.get_effect_size( - df, "class_label", att[0] - ) - if dist_type == "hist": - ax = sns.histplot(df, x=att[0], hue="class_label", kde=True) - caption = ( - f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}" + if self.util.is_categorical(df["class_label"]) or bin_reals: + if bin_reals: + self.util.debug( + f"{self.name}: binning continuous variable to categories" ) - ax.set_title(caption) - ax.set_xlabel(f"value of {att[0]}") - ax.set_ylabel(f"number of {type}") - else: - ax = sns.displot( + cat_vals = self.util.continuous_to_categorical( + df[self.target] + ) + df[f"{self.target}_binned"] = cat_vals + ax, caption = self._plotcatcont( df, - x=att[0], - hue="class_label", - kind="kde", - fill=True, + f"{self.target}_binned", + att[0], + self.target, + type_s, ) - caption = ( - f"{type} {df.shape[0]}. {cat_str} ({cats}):" f" {es}" + else: + ax, caption = self._plotcatcont( + df, att[0], "class_label", att[0], type_s ) - ax.fig.suptitle(caption) else: - pearson = stats.pearsonr(df[self.target], df[att[0]]) - pearson = int(pearson[0] * 1000) / 1000 - pearson_string = f"PCC: {pearson}" - ax = sns.scatterplot(data=df, x=self.target, y=att[0]) - caption = f"{type} {df.shape[0]}. {pearson_string}" - ax.set_title(caption) + ax, caption = self._plot2cont( + df, self.target, att[0], self.target, type_s + ) fig = ax.figure # plt.tight_layout() - img_path = f"{fig_dir}{filename}_{type}.{self.format}" + img_path = f"{fig_dir}{filename}_{type_s}.{self.format}" plt.savefig(img_path) plt.close(fig) glob_conf.report.add_item( @@ -208,7 +172,7 @@ def plot_distributions(self, df, type="samples"): pearson_string = f"PCC: {pearson}" ax = sns.scatterplot(data=df, x=att1, y=att2, hue="class_label") fig = ax.figure - ax.set_title(f"{type} {df.shape[0]}. {pearson_string}") + ax.set_title(f"{type_s} {df.shape[0]}. 
{pearson_string}") plt.tight_layout() plt.savefig(f"{fig_dir}{filename}_{type}.{self.format}") plt.close(fig) @@ -219,6 +183,56 @@ def plot_distributions(self, df, type="samples"): f" {att} has more than 2 values" ) + def _plot2cont(self, df, col1, col2, xlab, ylab): + """ + plot relation of two continuous distributions + """ + pearson = stats.pearsonr(df[col1], df[col2]) + # trunc to three digits + pearson = int(pearson[0] * 1000) / 1000 + pearson_string = f"PCC: {pearson}" + ax = sns.scatterplot(data=df, x=col1, y=col2) + caption = f"{ylab} {df.shape[0]}. {pearson_string}" + ax.set_title(caption) + return ax, caption + + def _plotcatcont(self, df, cat_col, cont_col, xlab, ylab): + """ + plot relation of categorical distribution with continuous + """ + dist_type = self.util.config_val("EXPL", "dist_type", "kde") + cats, cat_str, es = su.get_effect_size(df, cont_col, cat_col) + if dist_type == "hist": + ax = sns.histplot(df, x=cat_col, hue=cont_col, kde=True) + caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}" + ax.set_title(caption) + ax.set_xlabel(f"{xlab}") + ax.set_ylabel(f"number of {ylab}") + else: + ax = sns.displot(df, x=cat_col, hue=cont_col, kind="kde", fill=True) + ax.set(xlabel=f"{xlab}") + caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}" + ax.fig.suptitle(caption) + return ax, caption + + def _plot2cat(self, df, col1, col2, xlab, ylab): + """ + plot relation of 2 categorical distributions + """ + crosstab = pd.crosstab(index=df[col1], columns=df[col2]) + res_pval = stats.chi2_contingency(crosstab) + res_pval = int(res_pval[1] * 1000) / 1000 + caption = f"{ylab} {df.shape[0]}. 
P-val chi2: {res_pval}" + ax = ( + df.groupby(col1)[col2] + .value_counts() + .unstack() + .plot(kind="bar", stacked=True, title=caption, rot=0) + ) + ax.set_ylabel(f"number of {ylab}") + ax.set_xlabel(xlab) + return ax, caption + def plot_durations(self, df, filename, sample_selection, caption=""): fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs try: @@ -369,10 +383,18 @@ def getTsne(self, feats, perplexity=30, learning_rate=200): def plot_feature(self, title, feature, label, df_labels, df_features): fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs filename = f"{fig_dir}feat_dist_{title}_{feature}.{self.format}" - df_plot = pd.DataFrame({label: df_labels[label], feature: df_features[feature]}) - ax = sns.violinplot(data=df_plot, x=label, y=feature) - label = self.util.config_val("DATA", "target", "class_label") - ax.set(title=f"{title} samples", xlabel=label) + if self.util.is_categorical(df_labels[label]): + df_plot = pd.DataFrame( + {label: df_labels[label], feature: df_features[feature]} + ) + ax = sns.violinplot(data=df_plot, x=label, y=feature) + label = self.util.config_val("DATA", "target", "class_label") + ax.set(title=f"{title} samples", xlabel=label) + else: + plot_df = pd.concat([df_labels, df_features], axis=1) + ax, caption = self._plot2cont(plot_df, label, feature, label, feature) + # def _plot2cont(self, df, col1, col2, xlab, ylab): + fig = ax.figure plt.tight_layout() plt.savefig(filename)