Skip to content

Commit

Permalink
[src] Batched spectral feature extraction on GPU (#3889)
Browse files Browse the repository at this point in the history
  • Loading branch information
LeviBarnes authored Jan 29, 2020
1 parent 48d2115 commit 1121c31
Show file tree
Hide file tree
Showing 7 changed files with 1,375 additions and 5 deletions.
10 changes: 6 additions & 4 deletions src/cudafeat/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@ ifeq ($(CUDA), true)
TESTFILES =

ifeq ($(CUDA), true)
OBJFILES += feature-window-cuda.o feature-spectral-cuda.o feature-online-cmvn-cuda.o \
online-ivector-feature-cuda-kernels.o online-ivector-feature-cuda.o \
online-cuda-feature-pipeline.o feature-online-batched-cmvn-cuda.o \
feature-online-batched-cmvn-cuda-kernels.o
OBJFILES += feature-window-cuda.o feature-spectral-cuda.o \
feature-online-cmvn-cuda.o feature-online-batched-spectral-cuda.o \
feature-spectral-batched-kernels.o \
online-ivector-feature-cuda-kernels.o online-ivector-feature-cuda.o \
online-cuda-feature-pipeline.o feature-online-batched-cmvn-cuda.o \
feature-online-batched-cmvn-cuda-kernels.o
endif

LIBNAME = kaldi-cudafeat
Expand Down
256 changes: 256 additions & 0 deletions src/cudafeat/feature-online-batched-spectral-cuda.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
// cudafeature/feature-online-batched-spectral-cuda.cc
//
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Justin Luitjens, Levi Barnes
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cudafeat/feature-online-batched-spectral-cuda.h"
#include "cudafeat/feature-spectral-batched-kernels.h"

namespace kaldi {

CudaOnlineBatchedSpectralFeatures::CudaOnlineBatchedSpectralFeatures(
const CudaSpectralFeatureOptions &opts, int32_t max_chunk_frames,
int32_t num_channels, int32_t max_lanes)
: MfccComputer(opts.mfcc_opts),
cu_lifter_coeffs_(lifter_coeffs_),
cu_dct_matrix_(dct_matrix_),
window_function_(opts.mfcc_opts.frame_opts),
max_chunk_frames_(max_chunk_frames),
num_channels_(num_channels),
max_lanes_(max_lanes) {
KALDI_ASSERT(max_chunk_frames > 0);
const MelBanks *mel_banks = GetMelBanks(1.0);
const std::vector<std::pair<int32, Vector<BaseFloat>>> &bins =
mel_banks->GetBins();
int size = bins.size();
bin_size_ = size;
std::vector<int32> offsets(size), sizes(size);
std::vector<float *> vecs(size);
cu_vecs_ = new CuVector<float>[size];
for (int i = 0; i < bins.size(); i++) {
cu_vecs_[i].Resize(bins[i].second.Dim(), kUndefined);
cu_vecs_[i].CopyFromVec(bins[i].second);
vecs[i] = cu_vecs_[i].Data();
sizes[i] = cu_vecs_[i].Dim();
offsets[i] = bins[i].first;
}
offsets_ = static_cast<int32 *>(
CuDevice::Instantiate().Malloc(size * sizeof(int32)));
sizes_ = static_cast<int32 *>(
CuDevice::Instantiate().Malloc(size * sizeof(int32)));
vecs_ = static_cast<float **>(
CuDevice::Instantiate().Malloc(size * sizeof(float *)));

CU_SAFE_CALL(cudaMemcpyAsync(vecs_, &vecs[0], size * sizeof(float *),
cudaMemcpyHostToDevice, cudaStreamPerThread));
CU_SAFE_CALL(cudaMemcpyAsync(offsets_, &offsets[0], size * sizeof(int32),
cudaMemcpyHostToDevice, cudaStreamPerThread));
CU_SAFE_CALL(cudaMemcpyAsync(sizes_, &sizes[0], size * sizeof(int32),
cudaMemcpyHostToDevice, cudaStreamPerThread));
CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));

const FrameExtractionOptions frame_opts = opts.mfcc_opts.frame_opts;
frame_length_ = frame_opts.WindowSize();
padded_length_ = frame_opts.PaddedWindowSize();
fft_length_ = padded_length_ / 2; // + 1;
fft_batch_size_ = 800;

// place holders to get strides for cufft. these will be resized correctly
// later. The +2 for cufft/fftw requirements of an extra element at the end.
// Turning off stride because cufft seems buggy with a stride
int32_t fft_num_frames =
max_chunk_frames +
(fft_batch_size_ - max_chunk_frames_ % fft_batch_size_);
cu_windows_.Resize(fft_num_frames * max_lanes_, padded_length_, kUndefined,
kStrideEqualNumCols);
//+1 matches cufft/fftw requirements
tmp_window_.Resize(fft_num_frames * max_lanes_, padded_length_ + 2,
kUndefined, kStrideEqualNumCols);

// Pre-allocated memory for power spectra
power_spectrum_.Resize(max_chunk_frames_ * max_lanes_, padded_length_ / 2 + 1,
kUndefined);
raw_log_energies_.Resize(max_lanes_, max_chunk_frames_, kUndefined);
cu_mel_energies_.Resize(max_chunk_frames_ * max_lanes_, bin_size_,
kUndefined);
int32_t max_stash_size =
2 * (frame_opts.WindowSize() / 2 + frame_opts.WindowShift());
stash_.Resize(num_channels_, max_stash_size);

stride_ = cu_windows_.Stride();
tmp_stride_ = tmp_window_.Stride();

cufftPlanMany(&plan_, 1, &padded_length_, NULL, 1, stride_, NULL, 1,
tmp_stride_ / 2, CUFFT_R2C, fft_batch_size_);
cufftSetStream(plan_, cudaStreamPerThread);
cumfcc_opts_ = opts;
}

// ExtractWindow extracts a windowed frame of waveform with a power-of-two,
// padded size. It does mean subtraction, pre-emphasis and dithering as
// requested.
void CudaOnlineBatchedSpectralFeatures::ExtractWindowsBatched(
const LaneDesc *lanes, int32_t num_lanes,
const CuMatrixBase<BaseFloat> &wave) {
CU_SAFE_CALL(cudaGetLastError());
const FrameExtractionOptions &opts = GetFrameOptions();
cuda_extract_window(
lanes, num_lanes, max_chunk_frames_, opts.WindowShift(),
opts.WindowSize(), opts.PaddedWindowSize(), opts.snip_edges, wave.Data(),
wave.Stride(), cu_windows_.Data(), opts.WindowSize(),
cu_windows_.Stride(), stash_.Data(), stash_.NumCols(), stash_.Stride());
}

void CudaOnlineBatchedSpectralFeatures::ProcessWindowsBatched(
const LaneDesc *lanes, int32_t num_lanes,
const FrameExtractionOptions &opts,
CuMatrixBase<BaseFloat> *log_energy_pre_window) {
int fft_num_frames = cu_windows_.NumRows();
KALDI_ASSERT(fft_num_frames % fft_batch_size_ == 0);

cuda_process_window(
lanes, num_lanes, max_chunk_frames_, frame_length_, opts.dither,
std::numeric_limits<float>::epsilon(), opts.remove_dc_offset,
opts.preemph_coeff, NeedRawLogEnergy(), log_energy_pre_window->Data(),
log_energy_pre_window->Stride(), window_function_.cu_window.Data(),
tmp_window_.Data(), tmp_window_.Stride(), cu_windows_.Data(),
cu_windows_.Stride());

CU_SAFE_CALL(cudaGetLastError());
}

void CudaOnlineBatchedSpectralFeatures::UpdateStashBatched(
const LaneDesc *lanes, int32_t num_lanes,
const CuMatrixBase<BaseFloat> &wave) {
KALDI_ASSERT(stash_.NumCols() < 1024);

cuda_update_stash(lanes, num_lanes, wave.Data(), wave.Stride(), stash_.Data(),
stash_.NumCols(), stash_.Stride());
}

void CudaOnlineBatchedSpectralFeatures::ComputeFinalFeaturesBatched(
const LaneDesc *lanes, int32_t num_lanes, BaseFloat vtln_wrap,
CuMatrix<BaseFloat> *cu_signal_log_energy,
CuMatrix<BaseFloat> *cu_features) {
MfccOptions mfcc_opts = cumfcc_opts_.mfcc_opts;
Vector<float> tmp;
KALDI_ASSERT(mfcc_opts.htk_compat == false);

if (num_lanes == 0) return;

if (mfcc_opts.use_energy && !mfcc_opts.raw_energy) {
cuda_dot_log(max_chunk_frames_, num_lanes, cu_windows_.NumCols(),
cu_windows_.Data(), cu_windows_.Stride(),
cu_signal_log_energy->Data(), cu_signal_log_energy->Stride());
CU_SAFE_CALL(cudaGetLastError());
}

// make sure a reallocation hasn't changed these
KALDI_ASSERT(cu_windows_.Stride() == stride_);
KALDI_ASSERT(tmp_window_.Stride() == tmp_stride_);

// Perform FFTs in batches of fft_size. This reduces memory requirements
for (int idx = 0; idx < max_chunk_frames_ * num_lanes;
idx += fft_batch_size_) {
CUFFT_SAFE_CALL(cufftExecR2C(
plan_, cu_windows_.Data() + cu_windows_.Stride() * idx,
(cufftComplex *)(tmp_window_.Data() + tmp_window_.Stride() * idx)));
}

// Compute Power spectrum
cuda_power_spectrum(max_chunk_frames_, num_lanes, padded_length_,
tmp_window_.Data(), tmp_window_.Stride(),
power_spectrum_.Data(), power_spectrum_.Stride(),
cumfcc_opts_.use_power);
CU_SAFE_CALL(cudaGetLastError());

// mel banks
int num_bins = bin_size_;
cuda_mel_banks_compute(lanes, num_lanes, max_chunk_frames_, num_bins,
std::numeric_limits<float>::epsilon(), offsets_,
sizes_, vecs_, power_spectrum_.Data(),
power_spectrum_.Stride(), cu_mel_energies_.Data(),
cu_mel_energies_.Stride(), cumfcc_opts_.use_log_fbank);
CU_SAFE_CALL(cudaGetLastError());

// dct transform
if (cumfcc_opts_.use_dct) {
if (cu_features->NumRows() > cu_mel_energies_.NumRows()) {
CuSubMatrix<BaseFloat> cu_feats_sub(*cu_features, 0,
cu_mel_energies_.NumRows(), 0,
cu_features->NumCols());
cu_feats_sub.AddMatMat(1.0, cu_mel_energies_, kNoTrans, cu_dct_matrix_,
kTrans, 0.0);
} else {
cu_features->AddMatMat(1.0, cu_mel_energies_, kNoTrans, cu_dct_matrix_,
kTrans, 0.0);
}
cuda_apply_lifter_and_floor_energy(
lanes, num_lanes, max_chunk_frames_, cu_features->NumCols(),
mfcc_opts.cepstral_lifter, mfcc_opts.use_energy, mfcc_opts.energy_floor,
cu_signal_log_energy->Data(), cu_signal_log_energy->Stride(),
cu_lifter_coeffs_.Data(), cu_features->Data(), cu_features->Stride());

} else {
cudaMemcpyAsync(cu_features->Data(), cu_mel_energies_.Data(),
sizeof(BaseFloat) * max_chunk_frames_ * num_lanes *
cu_features->Stride(),
cudaMemcpyDeviceToDevice, cudaStreamPerThread);
}
CU_SAFE_CALL(cudaGetLastError());
}

void CudaOnlineBatchedSpectralFeatures::ComputeFeaturesBatched(
const LaneDesc *lanes, int32_t n_lanes,
const CuMatrixBase<BaseFloat> &cu_wave_in, BaseFloat sample_freq,
BaseFloat vtln_warp, CuMatrix<BaseFloat> *cu_feats_out) {
// Note: cu_features is actually a rank 3 tensor.
// channels x frames x features
// it is currently represented as a matrix with n_channels*n_frames rows and
// n_features cols
const FrameExtractionOptions &frame_opts = GetFrameOptions();

if (frame_opts.dither != 0.0f) {
// Calling cu-rand directly
// CuRand class works on CuMatrixBase which must
// assume that the matrix is part of a larger matrix
// Doing this directly avoids unecessary memory copies
CURAND_SAFE_CALL(
curandGenerateNormal(GetCurandHandle(), tmp_window_.Data(),
tmp_window_.NumRows() * tmp_window_.Stride(),
0.0 /*mean*/, 1.0 /*stddev*/));
}

// Extract Windows
ExtractWindowsBatched(lanes, n_lanes, cu_wave_in);

UpdateStashBatched(lanes, n_lanes, cu_wave_in);

// Process Windows
ProcessWindowsBatched(lanes, n_lanes, frame_opts, &raw_log_energies_);

// Compute Features
ComputeFinalFeaturesBatched(lanes, n_lanes, 1.0, &raw_log_energies_,
cu_feats_out);
}

CudaOnlineBatchedSpectralFeatures::~CudaOnlineBatchedSpectralFeatures() {
delete[] cu_vecs_;
CuDevice::Instantiate().Free(vecs_);
CuDevice::Instantiate().Free(offsets_);
CuDevice::Instantiate().Free(sizes_);
cufftDestroy(plan_);
}
} // namespace kaldi
113 changes: 113 additions & 0 deletions src/cudafeat/feature-online-batched-spectral-cuda.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// cudafeat/feature-batched-spectral-cuda.h
//
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Justin Luitjens, Levi Barnes
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_CUDAFEAT_FEATURE_BATCHED_SPECTRAL_CUDA_H_
#define KALDI_CUDAFEAT_FEATURE_BATCHED_SPECTRAL_CUDA_H_

#if HAVE_CUDA == 1
#include <cufft.h>
#endif

#include "cudafeat/feature-spectral-cuda.h"
#include "cudafeat/feature-window-cuda.h"
#include "cudafeat/lane-desc.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "feat/feature-fbank.h"
#include "feat/feature-mfcc.h"

namespace kaldi {
// This class implements MFCC and Fbank computation in CUDA.
// It handles batched input.
// It takes input from device memory and outputs to
// device memory. It also does no synchronization.
class CudaOnlineBatchedSpectralFeatures : public MfccComputer {
public:
void ComputeFeatures(const CuVectorBase<BaseFloat> &cu_wave,
BaseFloat sample_freq, BaseFloat vtln_warp,
CuMatrix<BaseFloat> *cu_features) {
// Non-batched processing not allowed from
// CudaOnlineBatchedSpectralFeatures
KALDI_ASSERT(false);
}

void ComputeFeaturesBatched(const LaneDesc *lanes, int32_t n_lanes,
const CuMatrixBase<BaseFloat> &cu_wave_in,
BaseFloat sample_freq, BaseFloat vtln_warp,
CuMatrix<BaseFloat> *cu_feats_out);

CudaOnlineBatchedSpectralFeatures(const CudaSpectralFeatureOptions &opts,
int32_t max_chunk_frames,
int32_t num_channels, int32_t max_lanes);
~CudaOnlineBatchedSpectralFeatures();
CudaSpectralFeatureOptions cumfcc_opts_;
int32 Dim()
// The dimension of the output is different for MFCC and Fbank.
// This returns the appropriate value depending on the feature
// extraction algorithm
{
if (cumfcc_opts_.feature_type == MFCC) return MfccComputer::Dim();
// If we're running fbank, we need to set the dimension right
else
return cumfcc_opts_.mfcc_opts.mel_opts.num_bins +
(cumfcc_opts_.mfcc_opts.use_energy ? 1 : 0);
}

private:

void ExtractWindowsBatched(const LaneDesc *lanes, int32_t num_lanes,
const CuMatrixBase<BaseFloat> &wave);

void UpdateStashBatched(const LaneDesc *lanes, int32_t num_lanes,
const CuMatrixBase<BaseFloat> &wave);

void ProcessWindowsBatched(const LaneDesc *lanes, int32_t num_lanes,
const FrameExtractionOptions &opts,
CuMatrixBase<BaseFloat> *log_energy_pre_window);

void ComputeFinalFeaturesBatched(const LaneDesc *lanes, int32_t num_lanes,
BaseFloat vtln_wrap,
CuMatrix<BaseFloat> *cu_signal_log_energy,
CuMatrix<BaseFloat> *cu_features);

CuVector<float> cu_lifter_coeffs_;
CuMatrix<BaseFloat> cu_windows_;
CuMatrix<float> tmp_window_, cu_mel_energies_;
CuMatrix<float> cu_dct_matrix_;
CuMatrix<BaseFloat> stash_;
CuMatrix<BaseFloat> power_spectrum_;
CuMatrix<BaseFloat> raw_log_energies_;

int frame_length_, padded_length_, fft_length_, fft_batch_size_;
cufftHandle plan_;
CudaFeatureWindowFunction window_function_;

int bin_size_;
int32 *offsets_, *sizes_;
CuVector<float> *cu_vecs_;
float **vecs_;

// for sanity checking cufft
int32_t stride_, tmp_stride_;

int32_t max_chunk_frames_;
int32_t num_channels_;
int32_t max_lanes_;
};
} // namespace kaldi

#endif
Loading

0 comments on commit 1121c31

Please sign in to comment.