forked from naotokui/SpectrogramVAE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.py
62 lines (50 loc) · 1.77 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import librosa
import numpy as np
import json
# Audio-analysis configuration. The individual constants are the source of
# truth; `param` bundles them so the whole configuration can be passed or
# serialized as one object.
N_FFT = 1024            # FFT window size (samples)
HOP_LENGTH = 256        # hop between successive frames (samples)
SAMPLING_RATE = 16000   # audio sample rate (Hz)
MELSPEC_BANDS = 128     # number of mel filterbank bands
sample_secs = 2         # clip length used by the dataset (seconds)

param = {
    "N_FFT": N_FFT,
    "HOP_LENGTH": HOP_LENGTH,
    "SAMPLING_RATE": SAMPLING_RATE,
    "MELSPEC_BANDS": MELSPEC_BANDS,
    "sample_secs": sample_secs,
}

# Fixed number of raw audio samples per dataset example.
num_samples_dataset = int(sample_secs * SAMPLING_RATE)
# Function to read in an audio file and return a mel spectrogram
def get_melspec(filepath_or_audio, hop_length=HOP_LENGTH, n_mels=MELSPEC_BANDS, n_samples=num_samples_dataset,
                sample_secs=sample_secs, as_tf_input=False):
    """Load audio and convert it to a dB-scaled mel spectrogram.

    Args:
        filepath_or_audio: Path to an audio file, or an already-loaded
            1-D audio array (assumed to be at SAMPLING_RATE).
        hop_length: STFT hop length in samples.
        n_mels: Number of mel bands.
        n_samples: Fixed output length in samples; audio is truncated or
            zero-padded to this length. Falsy to keep the native length.
        sample_secs: Approximate clip duration in seconds; used to limit
            how much audio is decoded from disk. None loads the full file.
        as_tf_input: If True, normalize to [0, 1] and add batch/channel
            axes via spec_to_input.

    Returns:
        (S, length_ratio): the spectrogram and the fraction of n_samples
        that was actually covered by real audio (1.0 when not padded).
    """
    y_tmp = np.zeros(n_samples)

    # Load a little more than necessary as a buffer
    load_duration = None if sample_secs is None else 1.1 * sample_secs

    # Load audio file or take given input
    if isinstance(filepath_or_audio, str):
        # librosa.load is the public API; librosa.core.load is a deprecated alias.
        y, sr = librosa.load(filepath_or_audio, sr=SAMPLING_RATE, mono=True, duration=load_duration)
    else:
        y = filepath_or_audio
        sr = SAMPLING_RATE

    # Truncate or pad
    if n_samples:
        if len(y) >= n_samples:
            y_tmp = y[:n_samples]
            length_ratio = 1.0
        else:
            y_tmp[:len(y)] = y
            length_ratio = len(y) / n_samples
    else:
        y_tmp = y
        length_ratio = 1.0

    # stft -> mel conversion
    melspec = librosa.feature.melspectrogram(y=y_tmp, sr=sr,
                                             n_fft=N_FFT, hop_length=hop_length, n_mels=n_mels)
    # ref must be passed by keyword: it is keyword-only in librosa >= 0.10,
    # so the original positional call `power_to_db(melspec, np.max)` crashes there.
    S = librosa.power_to_db(melspec, ref=np.max)

    if as_tf_input:
        S = spec_to_input(S)

    return S, length_ratio
def spec_to_input(spec):
    """Map a dB-scaled spectrogram ([-80, 0]) to [0, 1] and add the
    leading batch axis and trailing channel axis expected by the model.

    Returns a float32 array of shape (1, n_mels, n_frames, 1).
    """
    normalized = (spec + 80.0) / 80.0
    # Insert batch axis at position 0 and channel axis at position 3.
    with_axes = np.expand_dims(normalized, axis=(0, 3))
    return np.float32(with_axes)