Skip to content

Commit

Permalink
0.71.1
Browse files Browse the repository at this point in the history
  • Loading branch information
FBurkhardt committed Nov 23, 2023
1 parent a781c19 commit 8eecac4
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 139 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Changelog
=========

Version 0.71.1
--------------
* made wav2vec2-large-robust-ft-swbd-300h the default model for the "wav2vec" feature type
* renamed praat features, omitting spaces
* fixed plot distribution bugs
* added feature plots for continuous targets

Version 0.71.0
--------------
* added explore visuals.
Expand Down
2 changes: 1 addition & 1 deletion nkululeko/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION="0.71.0"
VERSION="0.71.1"
SAMPLING_RATE = 16000
23 changes: 12 additions & 11 deletions nkululeko/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,17 +230,18 @@ def fill_train_and_tests(self):
if self.util.exp_is_classification():
datatype = self.util.config_val("DATA", "type", "dummy")
if datatype == "continuous":
if self.df_test.is_labeled:
# remember the target in case they get labelencoded later
self.df_test["class_label"] = self.df_test[self.target]
test_cats = self.df_test["class_label"].unique()
else:
# if there is no target, copy a dummy label
self.df_test = self._add_random_target(self.df_test)
if self.df_train.is_labeled:
# remember the target in case they get labelencoded later
self.df_train["class_label"] = self.df_train[self.target]
train_cats = self.df_train["class_label"].unique()
# if self.df_test.is_labeled:
# # remember the target in case they get labelencoded later
# self.df_test["class_label"] = self.df_test[self.target]
test_cats = self.df_test["class_label"].unique()
# else:
# # if there is no target, copy a dummy label
# self.df_test = self._add_random_target(self.df_test)
# if self.df_train.is_labeled:
# # remember the target in case they get labelencoded later
# self.df_train["class_label"] = self.df_train[self.target]
train_cats = self.df_train["class_label"].unique()

else:
if self.df_test.is_labeled:
test_cats = self.df_test[self.target].unique()
Expand Down
64 changes: 26 additions & 38 deletions nkululeko/feat_extract/feats_analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
class FeatureAnalyser:
def __init__(self, label, df_labels, df_features):
self.util = Util("feats_analyser")
target = self.util.config_val("DATA", "target", "emotion")
self.y = df_labels[target]
self.target = self.util.config_val("DATA", "target", "emotion")
self.labels = df_labels[self.target]
self.df_labels = df_labels
self.X = df_features
self.features = df_features
self.label = label

def analyse(self):
Expand All @@ -26,44 +26,40 @@ def analyse(self):
if self.util.exp_is_classification():
if model_s == "log_reg":
model = LogisticRegression()
model.fit(self.X, self.y)
model.fit(self.features, self.labels)
importance = model.coef_[0]
elif model_s == "tree":
model = DecisionTreeClassifier()
model.fit(self.X, self.y)
model.fit(self.features, self.labels)
importance = model.feature_importances_
plot_tree = eval(
self.util.config_val("EXPL", "plot_tree", "False")
)
plot_tree = eval(self.util.config_val("EXPL", "plot_tree", "False"))
if plot_tree:
plots = Plots()
plots.plot_tree(model, self.X)
plots.plot_tree(model, self.features)
else:
self.util.error(f"invalid analysis method: {model}")
else: # regression experiment
if model_s == "lin_reg":
model = LinearRegression()
model.fit(self.X, self.y)
model.fit(self.features, self.labels)
importance = model.coef_
elif model_s == "tree":
model = DecisionTreeRegressor()
model.fit(self.X, self.y)
model.fit(self.features, self.labels)
importance = model.feature_importances_
else:
self.util.error(f"invalid analysis method: {model_s}")

df_imp = pd.DataFrame(
{"feats": self.X.columns, "importance": importance}
{"feats": self.features.columns, "importance": importance}
)
df_imp = df_imp.sort_values(by="importance", ascending=False).iloc[
:max_feat_num
]
ax = df_imp.plot(x="feats", y="importance", kind="bar")
ax.set(title=f"{self.label} samples")
plt.tight_layout()
fig_dir = (
self.util.get_path("fig_dir") + "../"
) # one up because of the runs
fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs
exp_name = self.util.get_exp_name(only_data=True)
format = self.util.config_val("PLOT", "format", "png")
filename = f"{fig_dir}{exp_name}EXPL_{model_s}.{format}"
Expand All @@ -73,36 +69,28 @@ def analyse(self):
plt.close(fig)
# result file
res_dir = self.util.get_path("res_dir")
file_name = f"{res_dir}{self.util.get_exp_name(only_data=True)}EXPL_{model_s}.txt"
file_name = (
f"{res_dir}{self.util.get_exp_name(only_data=True)}EXPL_{model_s}.txt"
)
with open(file_name, "w") as text_file:
text_file.write(
"features in order of decreasing importance according to model"
f" {model_s}:\n"
+ f"{str(df_imp.feats.values)}\n"
f" {model_s}:\n" + f"{str(df_imp.feats.values)}\n"
)

df_imp.to_csv(file_name, mode="a")

# check if feature distributions should be plotted
plot_feats = self.util.config_val(
"EXPL", "feature_distributions", False
)
plot_feats = self.util.config_val("EXPL", "feature_distributions", False)
if plot_feats:
sample_selection = self.util.config_val(
"EXPL", "sample_selection", "all"
)
if self.util.exp_is_classification():
for feature in df_imp.feats:
# plot_feature(self, title, feature, label, df_labels, df_features):
_plots = Plots()
_plots.plot_feature(
sample_selection,
feature,
"class_label",
self.df_labels,
self.X,
)
else:
self.util.debug(
"can't plot feature distributions if not classification"
sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
for feature in df_imp.feats:
# plot_feature(self, title, feature, label, df_labels, df_features):
_plots = Plots()
_plots.plot_feature(
sample_selection,
feature,
self.target,
self.df_labels,
self.features,
)
17 changes: 7 additions & 10 deletions nkululeko/feat_extract/feats_wav2vec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ def __init__(self, name, data_df, feat_type):
cuda = "cuda" if torch.cuda.is_available() else "cpu"
self.device = self.util.config_val("MODEL", "device", cuda)
self.model_initialized = False
self.feat_type = feat_type
if feat_type == "wav2vec":
self.feat_type = "wav2vec2-large-robust-ft-swbd-300h"
else:
self.feat_type = feat_type

def init_model(self):
# load model
Expand All @@ -37,9 +40,7 @@ def extract(self):
"""Extract the features or load them from disk if present."""
store = self.util.get_path("store")
storage = f"{store}{self.name}.pkl"
extract = self.util.config_val(
"FEATS", "needs_feature_extraction", False
)
extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
if extract or no_reuse or not os.path.isfile(storage):
if not self.model_initialized:
Expand All @@ -57,15 +58,11 @@ def extract(self):
frame_offset=int(start.total_seconds() * 16000),
num_frames=int((end - start).total_seconds() * 16000),
)
assert (
sampling_rate == 16000
), f"got {sampling_rate} instead of 16000"
assert sampling_rate == 16000, f"got {sampling_rate} instead of 16000"
emb = self.get_embeddings(signal, sampling_rate, file)
emb_series[idx] = emb
# print(f"emb_series shape: {emb_series.shape}")
self.df = pd.DataFrame(
emb_series.values.tolist(), index=self.data_df.index
)
self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
# print(f"df shape: {self.df.shape}")
self.df.to_pickle(storage)
try:
Expand Down
20 changes: 10 additions & 10 deletions nkululeko/feat_extract/feinberg_praat.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,11 +466,11 @@ def get_speech_rate(file_index):
cols = [
"nsyll",
"npause",
"dur(s)",
"phonationtime(s)",
"speechrate(nsyll / dur)",
"articulation rate(nsyll / phonationtime)",
"ASD(speakingtime / nsyll)",
"dur_s",
"phonationtime_s",
"speechrate_nsyll_dur",
"articulation_rate_nsyll_phonationtime",
"ASD_speakingtime_nsyll",
]
datalist = []
for idx, (wave_file, start, end) in enumerate(tqdm(file_index.to_list())):
Expand Down Expand Up @@ -621,10 +621,10 @@ def speech_rate(sound):
speechrate_dictionary = {
"nsyll": voicedcount,
"npause": npause,
"dur(s)": originaldur,
"phonationtime(s)": intensity_duration,
"speechrate(nsyll / dur)": speakingrate,
"articulation rate(nsyll / phonationtime)": articulationrate,
"ASD(speakingtime / nsyll)": asd,
"dur_s": originaldur,
"phonationtime_s": intensity_duration,
"speechrate_nsyll_dur": speakingrate,
"articulation_rate_nsyll_phonationtime": articulationrate,
"ASD_speakingtime_nsyll": asd,
}
return speechrate_dictionary
2 changes: 1 addition & 1 deletion nkululeko/feature_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def extract(self):
self.featExtractor = TRILLset(
f"{store_name}_{self.feats_designation}", self.data_df
)
elif feats_type.startswith("wav2vec2"):
elif feats_type.startswith("wav2vec"):
from nkululeko.feat_extract.feats_wav2vec2 import Wav2vec2

self.featExtractor = Wav2vec2(
Expand Down
Loading

0 comments on commit 8eecac4

Please sign in to comment.