diff --git a/Melt/.gitignore b/.gitignore similarity index 58% rename from Melt/.gitignore rename to .gitignore index f5abbba..60ac581 100644 --- a/Melt/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ __pycache__ *.pyc -venv_* +env_* +.idea + diff --git a/Melt/chain.py b/Melt/chain.py index e65c124..ac5a6f9 100755 --- a/Melt/chain.py +++ b/Melt/chain.py @@ -9,91 +9,21 @@ import time import common -import ensemble -import squawk - - -def noise_reduce(file_name): - import noise_reduction - sample_rate = 48000 - source = common.load_audio_file_as_numpy_array(file_name, sample_rate) - nr = noise_reduction.noise_reduce(source, sample_rate) - return (source, nr, sample_rate) - - -def find_nr_squawks_from_file_name(file_name): - (source, nr, sample_rate) = noise_reduce(file_name) - squawks = squawk.find_squawks(nr, sample_rate) - return (source, nr, squawks, sample_rate) - - -def species_identify(source, nr, squawks, sample_rate, verbose=False): - import json - import numpy - import squawk - - e = ensemble.ensemble() - for s in squawks: - waveform = squawk.extract_squawk_waveform(nr, sample_rate, s) - e.append_waveform(waveform) - model_version = 'sc_ah' - p = e.apply_model(model_version) - - label_file_name = 'model/model_%s_label.json' % model_version - with open(label_file_name, 'r') as f: - label = json.loads(f.read()) - - tag = [] - for row, squawk in zip(p, squawks): - mm = numpy.argmax(row) - m2 = numpy.argsort(row)[-2] - species = label[mm] - if not verbose: - if row[mm] < 0.75: - continue - if row[m2] > 0.3: - continue - if species in 'noise,other,unknown'.split(','): - continue - - entry = {} - entry['species'] = species - entry['begin_s'] = round(squawk['begin_i'] / sample_rate, 2) - entry['end_s'] = round(squawk['end_i'] / sample_rate, 2) - if verbose: - entry['confidence'] = '%d%%' % (100 * row[mm]) - entry['or'] = '%s (%d%%)' % (label[m2], 100 * row[m2]) - tag.append(entry) - result = {} - result['species_identify'] = tag - result['species_identify_version'] = 
'2019-12-12_A' - return result +from identify_species import identify_species -def speech_detect(source, nr, squawks, sample_rate): - e = ensemble.ensemble() - for s in squawks: - waveform = squawk.extract_squawk_waveform(nr, sample_rate, s) - e.append_waveform(waveform) - p = e.apply_model('sd_aa') +def species_identify(file_name, metadata_name, models): + labels = identify_species(file_name, metadata_name, models) result = {} - result['speech_detection_version'] = '2019-10-30_A' - human_squawk_count = 0 - for(pb, ph) in p: - if pb < 0.1 and ph > 0.95: - human_squawk_count += 1 - result['speech_detection'] = (human_squawk_count > 3) + result['species_identify'] = labels + result['species_identify_version'] = '2021-02-01' return result - -def examine(file_name, summary): +def examine(file_name, metadata_name, models, summary): import cacophony_index ci = cacophony_index.calculate(file_name) summary.update(ci) - nss = find_nr_squawks_from_file_name(file_name) - summary.update(speech_detect(*nss)) - summary.update(species_identify(*nss)) - + summary.update(species_identify(file_name, metadata_name, models)) def main(): argv = sys.argv @@ -106,20 +36,8 @@ def main(): import cacophony_index ci = cacophony_index.calculate(argv[2]) summary.update(ci) - elif argv[1] == '-examine': - examine(argv[2], summary) - elif argv[1] == '-noise_reduce': - (source, nr, sample_rate) = noise_reduce(argv[2]) - common.write_audio_to_file( - 'temp/noise_reduce_stereo.ogg', sample_rate, source, nr) - elif argv[1] == '-species_identify': - nss = find_nr_squawks_from_file_name(argv[2]) - summary.update(species_identify(*nss)) - elif argv[1] == '-speech_detect': - nss = find_nr_squawks_from_file_name(argv[2]) - summary.update(speech_detect(*nss)) else: - result = -1 + examine(argv[1], argv[2], argv[3], summary) t1 = time.time() diff --git a/Melt/ensemble.py b/Melt/ensemble.py deleted file mode 100644 index d7f0ea7..0000000 --- a/Melt/ensemble.py +++ /dev/null @@ -1,55 +0,0 @@ -# -*- coding: 
"""Detect morepork calls in an audio recording with an ensemble of Keras models.

The recording is converted to a magnitude spectrogram, cut into overlapping
fixed-length windows ("samples"), scored by every model found under a models
directory, and consecutive windows whose summed activation reaches
``activation_threshold`` are merged into labelled time spans.

``librosa`` and ``tensorflow`` are imported at the point of use so that
importing this module stays cheap and writes nothing to stdout.
"""

import os

import numpy as np

# Frequency band (Hz) of the spectrogram rows handed to the models.
frequency_min = 600
frequency_max = 1200
num_bands = int((frequency_max - frequency_min) / 10)
# Spectrogram columns ("slices") per second of audio.
slices_per_second = 20
# Length of one model input window and the step between window starts.
seconds_per_sample = 3.0
slices_per_sample = int(slices_per_second * seconds_per_sample)
sample_slide_seconds = 1.0
sample_slide_slices = int(sample_slide_seconds * slices_per_second)
# Minimum activation, summed over all models, for a window to count as a hit.
activation_threshold = 1.0

model_file_name = 'saved_model.pb'


def _load_sample(path):
    """Load an audio file and return ``(sample_rate, spectrogram)``.

    The FFT size is one tenth of the sample rate and the hop is half of that,
    which is what the 10 Hz row spacing and ``slices_per_second`` above rely
    on. Only the rows between ``frequency_min`` and ``frequency_max`` are kept.
    """
    import librosa

    frames, sr = librosa.load(path, sr=None)

    nfft = int(sr / 10)
    stft = librosa.stft(frames, n_fft=nfft, hop_length=int(nfft / 2))
    npspec = np.abs(stft)[int(frequency_min / 10):int(frequency_max / 10)]

    return sr, npspec


def _model_paths(basepath):
    """Return every directory at or below ``basepath`` holding a saved model.

    A directory qualifies when it directly contains a file named exactly
    ``model_file_name``. Each directory is listed once and the result is
    sorted so the load order does not depend on the file system.
    """
    # followlinks=True keeps the behaviour of descending into symlinked
    # directories, which an os.path.isdir() based recursion also does.
    return sorted(
        dirpath
        for dirpath, _dirnames, filenames in os.walk(basepath, followlinks=True)
        if model_file_name in filenames
    )


def _sample_windows(slice_count):
    """Return ``(start, limit)`` slice ranges of the windows to classify.

    Windows are ``slices_per_sample`` wide and start ``sample_slide_slices``
    apart. The final window is pulled back so it ends exactly at
    ``slice_count`` and is emitted once only. A recording shorter than one
    window yields no windows, because it cannot fill a model input.
    """
    if slice_count < slices_per_sample:
        return []
    windows = []
    for base in range(0, slice_count, sample_slide_slices):
        limit = min(base + slices_per_sample, slice_count)
        windows.append((limit - slices_per_sample, limit))
        if limit == slice_count:
            # Any later base would clamp to this same window again.
            break
    return windows


def _find_likely_span(liklihoods, start_times, first, last):
    """Estimate likelihood and time span for a run of positive samples.

    The models do not locate a call inside a sample, so the span is inferred
    on the assumption that every sample containing a whole call is predicted
    positive. Heuristics cover runs of one, two, three and more samples.

    :param liklihoods: percentage likelihoods for all samples
    :type liklihoods: list(float)
    :param start_times: start time in seconds of every sample (regularly
        spaced, except that the last interval may be shorter)
    :type start_times: list(float)
    :param first: index of the first positive sample of the run
    :type first: int
    :param last: index of the last positive sample of the run
    :type last: int
    :return: likelihood, start_time, end_time
    :rtype: float, float, float
    """
    count = last - first
    first_start_time = start_times[first]
    last_end_time = start_times[last] + seconds_per_sample
    if count == 0:
        # Single isolated sample: report it unchanged.
        return liklihoods[first], first_start_time, last_end_time
    if count == 1:
        # Two samples: assume the call sits in their overlap. The overlap
        # starts where the second sample starts, which also holds when the
        # final sample was pulled back to fit the recording.
        liklihood = max(liklihoods[first], liklihoods[last])
        return liklihood, start_times[last], first_start_time + seconds_per_sample

    run = liklihoods[first:last + 1]
    max_liklihood = max(run)
    if count == 2:
        # Three samples: probably two calls if the ends carry the maximum.
        min_liklihood = min(run)
        if max_liklihood == liklihoods[first + 1]:
            # The middle sample is the strongest, assume the call is there.
            return (max_liklihood, start_times[first + 1],
                    start_times[first + 1] + seconds_per_sample)
        if min_liklihood == liklihoods[first]:
            # Weakest at the start: drop the non-overlapping leading part.
            return max_liklihood, start_times[first + 1], last_end_time
        if min_liklihood == liklihoods[last]:
            # Weakest at the end: drop the non-overlapping trailing part.
            return (max_liklihood, first_start_time,
                    start_times[first + 1] + seconds_per_sample)
        # No basis for trimming, report the full span.
        return max_liklihood, first_start_time, last_end_time

    # More than three samples: only trim an end that is not the strongest.
    begin = start_times[first + 1] if max_liklihood > liklihoods[first] else first_start_time
    if max_liklihood > liklihoods[last]:
        end = start_times[last - 1] + seconds_per_sample
    else:
        end = last_end_time
    return max_liklihood, begin, end


def build_entry(begin, end, species, activation):
    """Build one label dict from a span, a species name and a percentage.

    ``activation`` is a percentage and is stored as a fraction rounded to two
    decimals. The key spelling ``'liklihood'`` is part of the emitted data and
    is deliberately left unchanged.
    """
    return {
        'begin_s': begin,
        'end_s': end,
        'species': species,
        'liklihood': round(activation * 0.01, 2),
    }


def _collect_labels(activations_sum, liklihoods, start_times):
    """Merge runs of samples at or above the threshold into label entries."""
    labels = []
    first_index = -1
    last_index = -1

    def flush():
        liklihood, start_time, end_time = _find_likely_span(
            liklihoods, start_times, first_index, last_index)
        labels.append(build_entry(start_time, end_time, 'morepork', liklihood))

    for i, activation in enumerate(activations_sum):
        if activation >= activation_threshold:
            if first_index < 0:
                first_index = i
            last_index = i
        elif first_index >= 0:
            # Just past the end of a run: record it and start over.
            flush()
            first_index = -1
    if first_index >= 0:
        # The recording ended inside a run.
        flush()
    return labels


def identify_species(recording, metadata, models):
    """Return morepork labels for ``recording``.

    :param recording: path of the audio file to analyse
    :param metadata: accepted for the caller's signature, currently unused
    :param models: directory searched recursively for saved models
    :return: list of dicts as produced by :func:`build_entry`; empty when the
        recording is shorter than one sample window
    :raises ValueError: if no saved model is found under ``models``
    """
    import librosa
    import tensorflow as tf

    model_paths = _model_paths(models)
    if not model_paths:
        raise ValueError('no %s found under %s' % (model_file_name, models))

    # get spectrogram to be checked
    sr, npspec = _load_sample(recording)

    # divide recording into samples of appropriate length
    windows = _sample_windows(npspec.shape[1])
    if not windows:
        return []
    samples = []
    start_times = []
    for start, limit in windows:
        start_times.append(start / slices_per_second)
        sample = librosa.amplitude_to_db(npspec[:, start:limit], ref=np.max)
        floor = abs(sample.min())
        if floor > 0:
            sample = sample / floor
        # A window with no dynamic range (e.g. digital silence) has floor 0;
        # skipping the division maps it to all ones instead of NaN.
        sample = sample + 1.0
        samples.append(sample.reshape(sample.shape + (1,)))
    samples = np.array(samples)

    # accumulate results from all models
    activations_sum = np.zeros(len(samples))
    for path in model_paths:
        model = tf.keras.models.load_model(path)
        activations_sum += model.predict(samples).flatten()

    # generate labels from summed activations
    liklihoods = [round(v * 100 / len(model_paths)) for v in activations_sum]
    return _collect_labels(activations_sum, liklihoods, start_times)
osn == 'win': - print('Install https://www.ffmpeg.org/download.html') - - cmd = 'python3 -m venv %s' % dir_name - print(cmd) - cmd = venv_prefix + ' pip install numpy scipy' - print(cmd) - cmd = venv_prefix + ' pip install tensorflow-gpu' - print(cmd) - print(venv_prefix) - - -def pex(): - import os - #os.system('pip install pex') - cmd = 'pex . -c pex_entry.py -o melt.pex' - os.system(cmd) - - -def main(): - """Main entrypoint for Melt, in normal usage, everything starts here.""" - - argv = sys.argv - venv_prefix = common.get_venv_prefix() - dir_name = common.get_config_dir() - - if len(argv) == 1: - show_help() - return 0 - - if argv[1] == 'pex': - pex() - return 0 - - if argv[1] == 'setup': - setup() - return 0 - - suffix = argv[1].split('.')[-1].lower() - exts = '3gp,aac,ac3,adts,aif,aifc,caf,dts,dtshd,flac,gsm,m4a,mp3,mp4,mpa,oga,ogg,opus,ra,rif,wav' - - source_prefix = common.get_source_prefix() - vpchain_command = '%s python %schain.py ' % (venv_prefix, source_prefix) - if suffix in exts.split(','): - cmd = '%s-examine "%s"' % (vpchain_command, argv[1]) - common.execute(cmd) - return 0 - - show_help() - return -1 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/Melt/model/model_sc_aa.h5 b/Melt/model/model_sc_aa.h5 deleted file mode 100755 index c73ee53..0000000 Binary files a/Melt/model/model_sc_aa.h5 and /dev/null differ diff --git a/Melt/model/model_sc_aa_label.json b/Melt/model/model_sc_aa_label.json deleted file mode 100755 index 39f79c9..0000000 --- a/Melt/model/model_sc_aa_label.json +++ /dev/null @@ -1,39 +0,0 @@ -[ - "blackbird", - "cat", - "cuckoo", - "dog", - "duck", - "fantail", - "finch", - "gull", - "harrier", - "honeyeater", - "human", - "kaka", - "kakapo", - "kea", - "kiwi", - "kokako", - "magpie", - "morepork", - "myna", - "noise", - "north-island-saddleback", - "pukeko", - "rifleman", - "robin", - "sacred-kingfisher", - "sheep", - "silvereye", - "skylark", - "south-island-saddleback", - "south-island-takahe", - "sparrow", - 
"spotted-dove", - "starling", - "thrush", - "unknown", - "warbler", - "weka" -] \ No newline at end of file diff --git a/Melt/model/model_sc_ah.h5 b/Melt/model/model_sc_ah.h5 deleted file mode 100755 index c703d17..0000000 Binary files a/Melt/model/model_sc_ah.h5 and /dev/null differ diff --git a/Melt/model/model_sc_ah_label.json b/Melt/model/model_sc_ah_label.json deleted file mode 100755 index d6d5f2d..0000000 --- a/Melt/model/model_sc_ah_label.json +++ /dev/null @@ -1,33 +0,0 @@ -[ - "noise", - "sparrowtail", - "finch", - "gull", - "duck", - "honeyeater", - "blackbird", - "warbler", - "starling", - "magpie", - "harrier", - "thrush", - "swallow", - "silvereye", - "kingfisher", - "pukeko", - "myna", - "goose", - "morepork", - "kokako", - "human", - "robin", - "kaka", - "cuckoo", - "weka", - "kea", - "kiwi", - "sheep", - "heron", - "dove", - "other" -] \ No newline at end of file diff --git a/Melt/model/model_sd_aa.h5 b/Melt/model/model_sd_aa.h5 deleted file mode 100644 index 33c0574..0000000 Binary files a/Melt/model/model_sd_aa.h5 and /dev/null differ diff --git a/Melt/noise_reduction.py b/Melt/noise_reduction.py deleted file mode 100755 index a432dac..0000000 --- a/Melt/noise_reduction.py +++ /dev/null @@ -1,127 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (C) 2019 Chris Blackbourn - -"""Noise reduction.""" - -import numpy -import scipy - -import common - - -class spectrogram_helper: - def __init__(self, source_pad, spectrogram, stride, sample_rate): - self.spectrogram = spectrogram - (self.block_count, dct_width) = spectrogram.shape - self.stride = stride - - window_c = common.get_window_const(dct_width, 'tukey') - - for index in range(self.block_count): - block_index = index * stride - block = source_pad[block_index:block_index + dct_width] * window_c - dct = scipy.fftpack.dct(block) - spectrogram[index] = dct - - self.buckets = [] - msw = 50 * sample_rate // stride - max_spec_width = min(msw, self.block_count) - division_count = max(int((self.block_count * 
1.7) / max_spec_width), 1) - for i in range(division_count): - t0 = 0 - if i: - t0 = (self.block_count - max_spec_width) * \ - i // (division_count - 1) - t1 = min(t0 + max_spec_width, self.block_count) - self.buckets.append((t0, t1)) - - self.currentBucket = -2 - - def get_tolerance(self, index): - qb = (index, index, index) - q = min(self.buckets, key=lambda x: abs(x[0] + x[1] - 2 * index)) - if self.currentBucket != q: - self.currentBucket = q - (t0, t1) = q - bin_medians = numpy.median(abs(self.spectrogram[t0:t1, ]), axis=0) - self.tolerance = 4 * \ - numpy.convolve(bin_medians, numpy.ones(8) / 8)[4:-3] - - return self.tolerance - - -def noise_reduce_dct(source, sample_rate, options): - original_sample_count = source.shape[0] - dct_width = 2048 - - trim_width = int(dct_width / 8) - stride = dct_width - trim_width * 3 - - block_count = (original_sample_count + stride - 1) // stride - source_pad = numpy.pad(source, (stride, stride * 2), 'reflect') - - #print('Building spectrogram') - spectrogram = numpy.empty((block_count, dct_width)) - - sph = spectrogram_helper(source_pad, spectrogram, stride, sample_rate) - - # anything below bass_cut_off_freq requires specialised techniques - bass_cut_off_freq = 100 - bass_cut_off_band = bass_cut_off_freq * 2 * dct_width // sample_rate - - spectrogram_trimmed = numpy.empty((block_count, dct_width)) - rms_tab = numpy.empty(block_count) - - for index in range(block_count): - dct = spectrogram[index] - - mask = numpy.ones_like(dct) - mask[:bass_cut_off_band] *= 0 - - rms_tab[index] = common.rms(dct * mask) - - tolerance = sph.get_tolerance(index) - for band in range(dct_width): - if abs(dct[band]) < tolerance[band]: - mask[band] *= 0.0 - - maskCon = 10 * numpy.convolve(mask, numpy.ones(8) / 8)[4:-3] - - maskBin = numpy.where(maskCon > 0.1, 0, 1) - spectrogram_trimmed[index] = maskBin - - rms_cutoff = numpy.median(rms_tab) - - result_pad = numpy.zeros_like(source_pad) - for index in range(1, block_count - 1): - dct = 
spectrogram[index] - - trim3 = spectrogram_trimmed[index - 1] * \ - spectrogram_trimmed[index] * spectrogram_trimmed[index + 1] - dct *= (1 - trim3) - - if common.rms(dct) < rms_cutoff: - continue # too soft - - rt = scipy.fftpack.idct(dct) / (dct_width * 2) - - block_index = index * stride - result_pad[block_index + trim_width * 1:block_index + trim_width * - 2] += rt[trim_width * 1:trim_width * 2] * numpy.linspace(0, 1, trim_width) - result_pad[block_index + - trim_width * - 2:block_index + - trim_width * - 6] = rt[trim_width * - 2:trim_width * - 6] # *numpy.linspace(1,1,stride8*4) - result_pad[block_index + trim_width * 6:block_index + trim_width * - 7] = rt[trim_width * 6:trim_width * 7] * numpy.linspace(1, 0, trim_width) - - result = result_pad[stride:stride + original_sample_count] - return result - - -def noise_reduce(source, sample_rate, options={}): - return noise_reduce_dct(source, sample_rate, options) diff --git a/Melt/pex_entry.py b/Melt/pex_entry.py deleted file mode 100644 index b3a5630..0000000 --- a/Melt/pex_entry.py +++ /dev/null @@ -1,11 +0,0 @@ - -import chain -import common -import os - -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - - -summary = {} -chain.examine(sys.argv[1], summary) -print(common.jsdump(summary)) diff --git a/Melt/setup.py b/Melt/setup.py index 6fbcfd8..018b343 100644 --- a/Melt/setup.py +++ b/Melt/setup.py @@ -1,16 +1,14 @@ from distutils.core import setup scripts = [ - 'pex_entry.py', 'cacophony_index.py', - 'common.py', 'chain.py', - 'ensemble.py', - 'noise_reduction.py', - 'squawk.py', + 'common.py', + 'identify_species.py', ] reqs = [ + 'librosa', 'numpy', 'scipy', 'tensorflow', diff --git a/Melt/squawk.py b/Melt/squawk.py deleted file mode 100755 index 4914194..0000000 --- a/Melt/squawk.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (C) 2019 Chris Blackbourn - -"""Squawk extraction and manipulation.""" - -import numpy - -import common - - -def paired_item(source): - source_iter 
= iter(source) - while True: - try: - yield next(source_iter).item(), next(source_iter).item() - except StopIteration: - return - - -def merge_paired_short_time(udarray, small_time): - paired_iter = paired_item(udarray) - r = None - for s in paired_iter: - if not r: - r = s - elif s[0] < r[1] + small_time: - r = r[0], s[1] - else: - yield r - r = s - if r: - yield r - - -def find_squawks(source, sample_rate): - result = [] - - source_pad = numpy.pad(source, 1) - tolerance = common.rms(source) / 3 - t = (abs(source_pad) > tolerance) - s = numpy.where(numpy.diff(t))[0] - small_time = int(sample_rate * 0.1) - for begin_index, end_index in merge_paired_short_time(s, small_time): - if begin_index + 0.05 * sample_rate < end_index: - squawk = {'begin_i': begin_index, 'end_i': end_index} - result.append(squawk) - return result - - -def extract_squawk_waveform(source, sample_rate, squawk): - begin_index = squawk['begin_i'] - end_index = squawk['end_i'] - width = int(0.05 * sample_rate) - t0 = max(0, begin_index - width) - t1 = min(source.shape[0], end_index + width) - result = source[t0:t1] - if not result.flags['WRITEABLE']: - result = result.copy() - result[:begin_index - t0] *= numpy.linspace(0, 1, begin_index - t0) - result[end_index - t0:t1 - t0] *= numpy.linspace(1, 0, t1 - end_index) - result *= 0.125 / common.rms(result) - return result diff --git a/requirements.txt b/requirements.txt index b2e54c5..2b7ac35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,10 @@ h5py~=2.10.0 Keras-Applications~=1.0.8 Keras-Preprocessing~=1.1.0 -numpy~=1.17.3 +librosa~=0.7.2 +numpy~=1.19.2 scipy~=1.3.1 -tensorflow==2.0.0 +tensorflow~=2.4.1 Wave~=0.0.2