Skip to content

Commit

Permalink
add support for new audio models
Browse files Browse the repository at this point in the history
  • Loading branch information
gferraro committed Apr 12, 2023
1 parent 79a2a65 commit 6ad89e8
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 1 deletion.
63 changes: 63 additions & 0 deletions Melt/custommel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import librosa
import numpy as np

# Replicates librosa's mel filterbank code, but with a configurable break
# frequency in the Hz <-> mel formula (700 Hz in librosa; 1750 Hz is used here).
def hz_to_mel(frequencies, break_freq=1750):
    """Convert frequencies in Hz to mels using a configurable break frequency.

    Computes ``2595 * log10(1 + f / break_freq)`` element-wise.

    Args:
        frequencies: Scalar or array-like of frequencies in Hz.
        break_freq: Break frequency of the scale in Hz. Must be strictly
            positive. Defaults to 1750, the same default ``mel_spec`` uses.

    Returns:
        The mel value(s) as a NumPy scalar/array with the shape of
        ``frequencies``.

    Raises:
        ValueError: If ``break_freq`` is zero or negative, which would
            otherwise silently produce inf/NaN values.
    """
    if np.any(np.asarray(break_freq) <= 0):
        raise ValueError(f"break_freq must be positive, got {break_freq!r}")

    # asarray avoids copying inputs that are already arrays; the input is
    # never mutated, so a copy is not needed.
    frequencies = np.asarray(frequencies)
    return 2595.0 * np.log10(1.0 + frequencies / break_freq)


def mel_frequencies(n_mels, fmin, fmax, break_freq):
    """Return ``n_mels`` frequencies (Hz) evenly spaced on the mel scale.

    Args:
        n_mels: Number of points to generate.
        fmin: Lowest frequency in Hz (first point).
        fmax: Highest frequency in Hz (last point).
        break_freq: Break frequency passed to ``hz_to_mel`` and used again
            when mapping the mel points back to Hz.

    Returns:
        Array of ``n_mels`` frequencies in Hz, uniformly spaced in mels
        between ``fmin`` and ``fmax``.
    """
    low_mel, high_mel = (hz_to_mel(edge, break_freq) for edge in (fmin, fmax))
    mel_points = np.linspace(low_mel, high_mel, n_mels)

    # Inverse of the hz_to_mel formula: f = break_freq * (10^(m / 2595) - 1)
    hz_ratio = np.power(10.0, mel_points / 2595.0) - 1.0
    return break_freq * hz_ratio


def mel_f(sr, n_mels, fmin, fmax, n_fft, break_freq):
    """Build a mel filterbank matrix using a custom break frequency.

    Each row is a triangular filter over the FFT bin frequencies, scaled
    Slaney-style so every channel has approximately constant energy.

    Args:
        sr: Sampling rate, forwarded to ``librosa.fft_frequencies``.
        n_mels: Number of mel bands (coerced to ``int``).
        fmin: Lowest band edge in Hz.
        fmax: Highest band edge in Hz.
        n_fft: FFT size; the filterbank has ``1 + n_fft // 2`` columns.
        break_freq: Break frequency of the mel scale in Hz.

    Returns:
        ``float32`` array of shape ``(n_mels, 1 + n_fft // 2)``.

    Warns:
        UserWarning: If any filter is entirely zero (an empty channel).
    """
    # Imported locally so this block does not depend on the module's
    # top-level import list.
    import warnings

    # Initialize the weights
    n_mels = int(n_mels)
    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=np.float32)

    # Center freqs of each FFT bin
    fftfreqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # Band edges: n_mels + 2 points uniformly spaced (in mels) between the
    # limits. Named band_edges so the function's own name is not shadowed.
    band_edges = mel_frequencies(n_mels + 2, fmin, fmax, break_freq)

    fdiff = np.diff(band_edges)
    # ramps[j, k] = band_edges[j] - fftfreqs[k]
    ramps = np.subtract.outer(band_edges, fftfreqs)

    for i in range(n_mels):
        # Rising and falling slopes of triangle i, evaluated at every bin
        lower = -ramps[i] / fdiff[i]
        upper = ramps[i + 2] / fdiff[i + 1]

        # .. then intersect them with each other and zero
        weights[i] = np.maximum(0, np.minimum(lower, upper))

    # Slaney-style mel is scaled to be approx constant energy per channel
    enorm = 2.0 / (band_edges[2 : n_mels + 2] - band_edges[:n_mels])
    weights *= enorm[:, np.newaxis]

    # A band whose lower edge is exactly 0 Hz is exempt from the check;
    # every other band must have at least one non-zero weight.
    if not np.all((band_edges[:-2] == 0) | (weights.max(axis=1) > 0)):
        # This means we have an empty channel somewhere. Report it as a
        # warning rather than printing, so stdout stays clean and callers
        # can filter or escalate it.
        warnings.warn(
            "Empty filters detected in mel frequency basis. "
            "Some channels will produce empty responses. "
            "Try increasing your sampling rate (and fmax) or "
            "reducing n_mels.",
            stacklevel=2,
        )

    return weights


def mel_spec(stft, sr, n_fft, hop_length, n_mels, fmin, fmax, break_freq=1750):
    """Project the power of an STFT onto the custom mel filterbank.

    Args:
        stft: STFT values; presumably the output of
            ``librosa.stft(data, n_fft=n_fft, hop_length=hop_length)``
            (TODO confirm against callers).
        sr: Sampling rate, forwarded to ``mel_f``.
        n_fft: FFT size, forwarded to ``mel_f``.
        hop_length: Accepted for interface compatibility; not used here.
        n_mels: Number of mel bands.
        fmin: Lowest band edge in Hz.
        fmax: Highest band edge in Hz.
        break_freq: Break frequency of the mel scale in Hz.

    Returns:
        The dot product of the mel filterbank with ``|stft| ** 2``.
    """
    # Squared magnitude, i.e. power rather than amplitude
    power = np.abs(stft) ** 2
    filterbank = mel_f(sr, n_mels, fmin, fmax, n_fft, break_freq)
    return np.dot(filterbank, power)
15 changes: 14 additions & 1 deletion Melt/identify_bird.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import json
import audioread.ffdec # Use ffmpeg decoder
import math
from custommel import mel_spec

fmt = "%(process)d %(thread)s:%(levelname)7s %(message)s"

Expand Down Expand Up @@ -37,13 +38,21 @@ def load_samples(
mel_break=1750,
htk=False,
n_mels=80,
fmin=50,
fmax=11000,
):
logging.debug(
"Loading samples with length %s stride %s hop length %s and mean_sub %s",
"Loading samples with length %s stride %s hop length %s and mean_sub %s mfcc %s break %s htk %s n mels %s fmin %s fmax %s",
segment_length,
stride,
hop_length,
mean_sub,
use_mfcc,
mel_break,
htk,
n_mels,
fmin,
fmax,
)
frames, sr = load_recording(path)
mels = []
Expand Down Expand Up @@ -141,6 +150,8 @@ def classify(file, model_file):
n_mels = meta.get("n_mels", 80)
mel_break = meta.get("mel_break", 1750)
htk = meta.get("htk", False)
fmin = meta.get("fmin", 50)
fmax = meta.get("fmax", 11000)

samples, length = load_samples(
file,
Expand All @@ -152,6 +163,8 @@ def classify(file, model_file):
htk=htk,
mel_break=mel_break,
n_mels=n_mels,
fmin=fmin,
fmax=fmax,
)
predictions = model.predict(samples, verbose=0)
tracks = []
Expand Down

0 comments on commit 6ad89e8

Please sign in to comment.