From 456eb117ae0340e809ee62f060dfaaceab182263 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Wed, 9 May 2018 18:49:05 -0400 Subject: [PATCH 1/7] Add support for padding and set default behavior to True (i.e. applying padding) --- crepe/core.py | 30 ++++++++++++++++++++++++------ crepe/version.py | 2 +- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/crepe/core.py b/crepe/core.py index 66ed879..c40852c 100644 --- a/crepe/core.py +++ b/crepe/core.py @@ -125,7 +125,7 @@ def to_viterbi_cents(salience): range(len(observations))]) -def get_activation(audio, sr): +def get_activation(audio, sr, center=True): """ Parameters @@ -135,6 +135,10 @@ def get_activation(audio, sr): sr : int Sample rate of the audio samples. The audio will be resampled if the sample rate is not 16 kHz, which is expected by the model. + center : boolean + - If `True` (default), the signal `audio` is padded so that frame + `D[:, t]` is centered at `audio[t * hop_length]`. + - If `False`, then `D[:, t]` begins at `audio[t * hop_length]` Returns ------- @@ -153,6 +157,11 @@ def get_activation(audio, sr): from resampy import resample audio = resample(audio, sr, model_srate) + # pad so that frames are centered around their timestamps (i.e. first frame + # is zero centered). + if center: + audio = np.pad(audio, 512, mode='constant', constant_values=0) + # make 1024-sample frames of the audio with hop length of 10 milliseconds hop_length = int(model_srate / 100) n_frames = 1 + int((len(audio) - 1024) / hop_length) @@ -168,7 +177,7 @@ def get_activation(audio, sr): return model.predict(frames, verbose=1) -def predict(audio, sr, viterbi=False): +def predict(audio, sr, viterbi=False, center=True): """ Perform pitch estimation on given audio @@ -180,7 +189,11 @@ def predict(audio, sr, viterbi=False): Sample rate of the audio samples. The audio will be resampled if the sample rate is not 16 kHz, which is expected by the model. viterbi : bool - Apply viterbi smoothing to the estimated pitch curve. False by default. + Apply viterbi smoothing to the estimated pitch curve. False by default. + center : boolean + - If `True` (default), the signal `audio` is padded so that frame + `D[:, t]` is centered at `audio[t * hop_length]`. + - If `False`, then `D[:, t]` begins at `audio[t * hop_length]` Returns ------- @@ -195,7 +208,7 @@ def predict(audio, sr, viterbi=False): activation: np.ndarray [shape=(T, 360)] The raw activation matrix """ - activation = get_activation(audio, sr) + activation = get_activation(audio, sr, center=center) confidence = activation.max(axis=1) if viterbi: @@ -212,7 +225,7 @@ def predict(audio, sr, viterbi=False): return time, frequency, confidence, activation -def process_file(file, output=None, viterbi=False, +def process_file(file, output=None, viterbi=False, center=True, save_activation=False, save_plot=False, plot_voicing=False): """ Use the input model to perform pitch estimation on the input file. @@ -226,6 +239,10 @@ def process_file(file, output=None, viterbi=False, be saved to the directory containing the input file. viterbi : bool Apply viterbi smoothing to the estimated pitch curve. False by default. + center : boolean + - If `True` (default), the signal `audio` is padded so that frame + `D[:, t]` is centered at `audio[t * hop_length]`. + - If `False`, then `D[:, t]` begins at `audio[t * hop_length]` save_activation : bool Save the output activation matrix to an .npy file. False by default. 
save_plot : bool @@ -249,7 +266,8 @@ def process_file(file, output=None, viterbi=False, print("CREPE: Could not read %s" % file, file=sys.stderr) raise - time, frequency, confidence, activation = predict(audio, sr, viterbi) + time, frequency, confidence, activation = predict(audio, sr, viterbi, + center=center) # write prediction as TSV f0_file = output_path(file, ".f0.csv", output) diff --git a/crepe/version.py b/crepe/version.py index 2d430d1..1fb86c7 100644 --- a/crepe/version.py +++ b/crepe/version.py @@ -1 +1 @@ -version = '0.0.4' +version = '0.0.5' From 692f3b4a39c406b71d79b31d90e3d3befdbade4a Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 10 May 2018 11:31:56 -0400 Subject: [PATCH 2/7] Add explanation about timestamps (centered frames) and organize in subsections --- README.md | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4c21f73..daf733d 100644 --- a/README.md +++ b/README.md @@ -58,16 +58,47 @@ The resulting `audio_file.f0.csv` contains 3 columns: the first with timestamps 0.08,199.678,0.775208 ... -By default CREPE does not apply temporal smoothing to the pitch curve, but Viterbi smoothing is supported via the optional `--viterbi` command line argument. The script can also optionally save the output activation matrix of the model to an npy file (`--save-activation`), where the matrix dimensions are (n_frames, 360) using a hop size of 10 ms (there are 360 pitch bins covering 20 cents each). The script can also output a plot of the activation matrix (`--save-plot`), saved to `audio_file.activation.png` including an optional visual representation of the model's voicing detection (`--plot-voicing`). Here's an example plot of the activation matrix (without the voicing overlay) for an excerpt of male singing voice: +#### Timestamps +Following the convention adopted by popular audio processing libraries such as +[Essentia](http://essentia.upf.edu/) and [Librosa](https://librosa.github.io/librosa/), +from v0.0.5 onwards CREPE will pad the input signal such that the first frame +is zero-centered (the center of the frame corresponds to time 0) and generally +all frames are centered around their corresponding timestamp, i.e. frame +`D[:, t]` is centered at `audio[t * hop_length]`. This behavior can be changed +by specifying the optional `--no-centering` flag, in which case the first frame +will *start* at time zero and generally frame `D[:, t]` will *begin* at +`audio[t * hop_length]`. Sticking to the default behavior (centered frames) is +strongly recommended to avoid misalignment with features and annotations produced +by other common audio processing tools. + + +#### Temporal smoothing +By default CREPE does not apply temporal smoothing to the pitch curve, but +Viterbi smoothing is supported via the optional `--viterbi` command line argument. + + +#### Saving the activation matrix +The script can also optionally save the output activation matrix of the model +to an npy file (`--save-activation`), where the matrix dimensions are +(n_frames, 360) using a hop size of 10 ms (there are 360 pitch bins covering 20 +cents each). + +The script can also output a plot of the activation matrix (`--save-plot`), +saved to `audio_file.activation.png` including an optional visual representation +of the model's voicing detection (`--plot-voicing`). 
Here's an example plot of +the activation matrix (without the voicing overlay) for an excerpt of male +singing voice: ![salience](https://user-images.githubusercontent.com/266841/38465913-6fa085b0-3aef-11e8-9633-bdd59618ea23.png) +#### Batch processing For batch processing of files, you can provide a folder path instead of a file path: ```bash $ python crepe.py audio_folder ``` The script will process all WAV files found inside the folder. +### Additional usage information For more information on the usage, please refer to the help message: ```bash From ae4d5ab6996ee74f8dfa4c7821525de75c0a2eaa Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 10 May 2018 11:52:53 -0400 Subject: [PATCH 3/7] Set optional args in call to predict explicitly --- crepe/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crepe/core.py b/crepe/core.py index c40852c..2019ef4 100644 --- a/crepe/core.py +++ b/crepe/core.py @@ -266,7 +266,8 @@ def process_file(file, output=None, viterbi=False, center=True, print("CREPE: Could not read %s" % file, file=sys.stderr) raise - time, frequency, confidence, activation = predict(audio, sr, viterbi, + time, frequency, confidence, activation = predict(audio, sr, + viterbi=viterbi, center=center) # write prediction as TSV @@ -281,7 +282,8 @@ def process_file(file, output=None, viterbi=False, center=True, if save_activation: activation_path = output_path(file, ".activation.npy", output) np.save(activation_path, activation) - print("CREPE: Saved the activation matrix at {}".format(activation_path)) + print("CREPE: Saved the activation matrix at {}".format( + activation_path)) # save the salience visualization in a PNG file if save_plot: From 361d5e178f25c932c249ac9709227f4771e9ef82 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 10 May 2018 11:53:18 -0400 Subject: [PATCH 4/7] Add support for --no-cengtering optional arg --- crepe/cli.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/crepe/cli.py b/crepe/cli.py index b8a441a..a8f13ef 100644 --- a/crepe/cli.py +++ b/crepe/cli.py @@ -8,7 +8,7 @@ def run(filename, output=None, viterbi=False, save_activation=False, - save_plot=False, plot_voicing=False): + save_plot=False, plot_voicing=False, no_centering=False): """ Collect the WAV files to process and run the model @@ -31,6 +31,12 @@ def run(filename, output=None, viterbi=False, save_activation=False, Include a visual representation of the voicing activity detection in the plot of the output activation matrix. False by default, only relevant if save_plot is True. + no_centering : bool + Don't pad the signal, meaning frames will begin at their timestamp + instead of being centered around their timestamp (which is the + default). CAUTION: setting this option can result in CREPE's output + being misaligned with respect to the output of other audio processing + tools and is generally not recommended. """ @@ -59,8 +65,12 @@ def run(filename, output=None, viterbi=False, save_activation=False, for i, file in enumerate(files): print('CREPE: Processing {} ... 
({}/{})'.format(file, i+1, len(files)), file=sys.stderr) - process_file(file, output, viterbi, - save_activation, save_plot, plot_voicing) + process_file(file, output=output, + viterbi=viterbi, + center=(not no_centering), + save_activation=save_activation, + save_plot=save_plot, + plot_voicing=plot_voicing) def main(): @@ -108,8 +118,21 @@ def main(): parser.add_argument('--plot-voicing', '-v', action='store_true', help='Plot the voicing prediction on top of the ' 'output activation matrix plot') + parser.add_argument('--no-centering', 'n', action='store_true', + help="Don't pad the signal, meaning frames will begin " + "at their timestamp instead of being centered " + "around their timestamp (which is the default). " + "CAUTION: setting this option can result in " + "CREPE's output being misaligned with respect to " + "the output of other audio processing tools and " + "is generally not recommended.") args = parser.parse_args() - run(args.filename, args.output, args.viterbi, - args.save_activation, args.save_plot, args.plot_voicing) + run(args.filename, + output=args.output, + viterbi=args.viterbi, + save_activation=args.save_activation, + save_plot=args.save_plot, + plot_voicing=args.plot_voicing, + no_centering=args.no_centering) From a7296cb1aeb98184ff4b0b4206b06a48323516f8 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 10 May 2018 12:03:57 -0400 Subject: [PATCH 5/7] Bugfix --- crepe/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crepe/cli.py b/crepe/cli.py index a8f13ef..9d94bb0 100644 --- a/crepe/cli.py +++ b/crepe/cli.py @@ -118,7 +118,7 @@ def main(): parser.add_argument('--plot-voicing', '-v', action='store_true', help='Plot the voicing prediction on top of the ' 'output activation matrix plot') - parser.add_argument('--no-centering', 'n', action='store_true', + parser.add_argument('--no-centering', '-n', action='store_true', help="Don't pad the signal, meaning frames will begin " "at their timestamp instead of being centered " "around their timestamp (which is the default). " From dbcd4d7449f7126bd87d533bfe13ff5051b97904 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 10 May 2018 12:07:27 -0400 Subject: [PATCH 6/7] Add long description for markdown display on pypi --- setup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/setup.py b/setup.py index dda4c9e..fe8649e 100644 --- a/setup.py +++ b/setup.py @@ -20,10 +20,15 @@ version = imp.load_source('crepe.version', os.path.join('crepe', 'version.py')) +with open('README.md') as file: + long_description = file.read() + setup( name='crepe', version=version.version, description='CREPE pitch tracker', + long_description=long_description, + long_description_content_type='text/markdown', url='https://github.com/marl/crepe', author='Jong Wook Kim', author_email='jongwook@nyu.edu', From 53f65512d8970853f7d9b0a5a81e9b3db8d5e0ce Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 10 May 2018 12:20:46 -0400 Subject: [PATCH 7/7] Update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index daf733d..802be42 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ $ python crepe.py audio_folder ``` The script will process all WAV files found inside the folder. -### Additional usage information +#### Additional usage information For more information on the usage, please refer to the help message: ```bash
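# A minimal usage sketch of the centering behavior added in this patch
# series; audio.wav is a placeholder file name, not taken from the patch.
# By default the input is padded so each frame is centered on its timestamp:
python crepe.py audio.wav

# The new --no-centering flag (short form -n) disables the padding, so frame
# t begins at audio[t * hop_length]. As the README and CLI help above warn,
# this can leave the output misaligned with other audio processing tools.
python crepe.py audio.wav --no-centering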