add support for new audio models

TheCacophonyProject · Apr 12, 2023 · 6ad89e8 · 6ad89e8
1 parent 79a2a65
commit 6ad89e8
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 1 deletion.
diff --git a/Melt/custommel.py b/Melt/custommel.py
@@ -0,0 +1,63 @@
+import librosa
+import numpy as np
+
+# Replicates librosa's mel filter bank code, but with the mel-scale break
+# frequency exposed as a parameter (librosa uses 700; mel_spec defaults to 1750).
+def hz_to_mel(frequencies, break_freq):
+    """Convert frequencies in Hz to the mel scale.
+
+    Computes ``2595 * log10(1 + f / break_freq)`` element-wise.
+
+    Args:
+        frequencies: Scalar or array-like of frequencies in Hz.
+        break_freq: Break frequency of the mel curve, in the same unit as
+            ``frequencies``.
+
+    Returns:
+        The mel value(s) as NumPy data, one per input frequency.
+    """
+    hz = np.array(frequencies)
+    scaled = 1.0 + hz / break_freq
+    return 2595.0 * np.log10(scaled)
+
+
+def mel_frequencies(n_mels, fmin, fmax, break_freq):
+    """Return ``n_mels`` frequencies in Hz, evenly spaced on the mel scale.
+
+    Args:
+        n_mels: Number of frequencies to generate.
+        fmin: Lowest frequency, in Hz (first value returned).
+        fmax: Highest frequency, in Hz (last value returned).
+        break_freq: Break frequency of the mel curve; used for both the
+            Hz -> mel conversion and its inverse.
+
+    Returns:
+        1-D array of ``n_mels`` frequencies in Hz.
+    """
+    # Space the points uniformly in mel units between the two limits ...
+    mel_points = np.linspace(
+        hz_to_mel(fmin, break_freq),
+        hz_to_mel(fmax, break_freq),
+        n_mels,
+    )
+
+    # ... then invert the mel formula to get back to Hz.
+    return break_freq * (np.power(10.0, mel_points / 2595.0) - 1.0)
+
+
+def mel_f(sr, n_mels, fmin, fmax, n_fft, break_freq):
+    """Build a mel filter bank that maps FFT bins onto mel bands.
+
+    Adapted from librosa's mel filter construction, with the mel scale's
+    break frequency exposed as a parameter.
+
+    Args:
+        sr: Sampling rate of the signal the FFT was taken from.
+        n_mels: Number of mel bands to generate (coerced to ``int``).
+        fmin: Lowest band edge, in Hz.
+        fmax: Highest band edge, in Hz.
+        n_fft: FFT size; the bank has ``1 + n_fft // 2`` columns.
+        break_freq: Break frequency passed to the mel-scale conversion.
+
+    Returns:
+        ``float32`` array of shape ``(n_mels, 1 + n_fft // 2)`` holding one
+        triangular, Slaney-normalised filter per row.
+
+    Warns:
+        UserWarning: If any band ends up with all-zero weights.
+    """
+    # Imported locally so the module's import block is left untouched.
+    import warnings
+
+    n_mels = int(n_mels)
+    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=np.float32)
+
+    # Centre frequency of each FFT bin.
+    fftfreqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
+
+    # n_mels + 2 band edges, uniformly spaced on the mel scale: filter i rises
+    # from edge i to edge i + 1 and falls back to zero at edge i + 2.
+    # (Named band_edges rather than mel_f to avoid shadowing this function.)
+    band_edges = mel_frequencies(n_mels + 2, fmin, fmax, break_freq)
+
+    edge_widths = np.diff(band_edges)
+    ramps = np.subtract.outer(band_edges, fftfreqs)
+
+    for i in range(n_mels):
+        # Rising and falling slopes of triangle i, evaluated at every bin ...
+        lower = -ramps[i] / edge_widths[i]
+        upper = ramps[i + 2] / edge_widths[i + 1]
+
+        # ... then intersected with each other and clipped at zero.
+        weights[i] = np.maximum(0, np.minimum(lower, upper))
+
+    # Slaney-style normalisation: scale each filter by 2 / bandwidth so every
+    # channel carries approximately constant energy.
+    enorm = 2.0 / (band_edges[2 : n_mels + 2] - band_edges[:n_mels])
+    weights *= enorm[:, np.newaxis]
+
+    # A band whose lower edge sits at 0 Hz is exempt from the check; any other
+    # band without a single positive weight is an empty channel.
+    if not np.all((band_edges[:-2] == 0) | (weights.max(axis=1) > 0)):
+        # Warn rather than print: keeps stdout clean and lets callers filter
+        # or escalate the message through the warnings machinery.
+        warnings.warn(
+            "Empty filters detected in mel frequency basis. "
+            "Some channels will produce empty responses. "
+            "Try increasing your sampling rate (and fmax) or "
+            "reducing n_mels.",
+            stacklevel=2,
+        )
+
+    return weights
+
+
+def mel_spec(stft, sr, n_fft, hop_length, n_mels, fmin, fmax, break_freq=1750):
+    """Project the power of an STFT onto the custom mel filter bank.
+
+    Args:
+        stft: Precomputed STFT; its squared magnitude is what gets projected.
+            Its first axis must line up with the filter bank's
+            ``1 + n_fft // 2`` columns for the dot product to be valid.
+        sr: Sampling rate, forwarded to ``mel_f``.
+        n_fft: FFT size, forwarded to ``mel_f``.
+        hop_length: Not used by this function.
+        n_mels: Number of mel bands, forwarded to ``mel_f``.
+        fmin: Lowest band edge in Hz, forwarded to ``mel_f``.
+        fmax: Highest band edge in Hz, forwarded to ``mel_f``.
+        break_freq: Mel-scale break frequency, forwarded to ``mel_f``.
+
+    Returns:
+        The filter bank from ``mel_f`` dotted with ``|stft| ** 2``.
+    """
+    power = np.abs(stft) ** 2
+    filter_bank = mel_f(sr, n_mels, fmin, fmax, n_fft, break_freq)
+    return np.dot(filter_bank, power)
diff --git a/Melt/identify_bird.py b/Melt/identify_bird.py
@@ -7,6 +7,7 @@
 import json
 import audioread.ffdec  # Use ffmpeg decoder
 import math
+from custommel import mel_spec
 
 fmt = "%(process)d %(thread)s:%(levelname)7s %(message)s"
 
@@ -37,13 +38,21 @@ def load_samples(
     mel_break=1750,
     htk=False,
     n_mels=80,
+    fmin=50,
+    fmax=11000,
 ):
     logging.debug(
-        "Loading samples with length %s stride %s hop length %s and mean_sub %s",
+        "Loading samples with length %s stride %s hop length %s and mean_sub %s mfcc %s break %s htk %s n mels %s fmin %s fmax %s",
         segment_length,
         stride,
         hop_length,
         mean_sub,
+        use_mfcc,
+        mel_break,
+        htk,
+        n_mels,
+        fmin,
+        fmax,
     )
     frames, sr = load_recording(path)
     mels = []
@@ -141,6 +150,8 @@ def classify(file, model_file):
     n_mels = meta.get("n_mels", 80)
     mel_break = meta.get("mel_break", 1750)
     htk = meta.get("htk", False)
+    fmin = meta.get("fmin", 50)
+    fmax = meta.get("fmax", 11000)
 
     samples, length = load_samples(
         file,
@@ -152,6 +163,8 @@ def classify(file, model_file):
         htk=htk,
         mel_break=mel_break,
         n_mels=n_mels,
+        fmin=fmin,
+        fmax=fmax,
     )
     predictions = model.predict(samples, verbose=0)
     tracks = []