зеркало из https://github.com/mozilla/kaldi.git
sandbox/pitch: various pitch-code cleanups, removing windowing config and replacing the variables we needed from there.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/pitch@3188 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
98db213695
Коммит
8d07c0d6bb
|
@ -41,11 +41,7 @@ static void UnitTestSimple() {
|
|||
// the parametrization object
|
||||
PitchExtractionOptions op;
|
||||
// trying to have same opts as baseline.
|
||||
op.frame_opts.dither = 0.0;
|
||||
op.frame_opts.preemph_coeff = 0.0;
|
||||
op.frame_opts.window_type = "hamming";
|
||||
op.frame_opts.remove_dc_offset = false;
|
||||
op.frame_opts.round_to_power_of_two = true;
|
||||
op.preemph_coeff = 0.0;
|
||||
// compute pitch.
|
||||
Compute(op, v, &m);
|
||||
std::cout << "Test passed :)\n\n";
|
||||
|
@ -66,11 +62,7 @@ static void UnitTestGetf0Compare1() {
|
|||
}
|
||||
// use pitch code with default configuration..
|
||||
PitchExtractionOptions op;
|
||||
op.frame_opts.dither = 0.0;
|
||||
op.frame_opts.preemph_coeff = 0.0;
|
||||
op.frame_opts.window_type = "hamming";
|
||||
op.frame_opts.remove_dc_offset = false;
|
||||
op.frame_opts.round_to_power_of_two = true;
|
||||
op.preemph_coeff = 0.0;
|
||||
// compute pitch.
|
||||
Matrix<BaseFloat> m;
|
||||
Compute(op, waveform, &m);
|
||||
|
@ -97,11 +89,8 @@ static void UnitTestGetf0CompareKeele() {
|
|||
SubVector<BaseFloat> waveform(wave.Data(), 0);
|
||||
// use pitch code with default configuration..
|
||||
PitchExtractionOptions op;
|
||||
op.frame_opts.window_type = "rectangular";
|
||||
op.frame_opts.remove_dc_offset = false;
|
||||
op.frame_opts.round_to_power_of_two = false;
|
||||
op.frame_opts.samp_freq = 8000;
|
||||
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
|
||||
op.samp_freq = 8000;
|
||||
op.preemph_coeff = exp(-7000/op.samp_freq);
|
||||
// compute pitch.
|
||||
Matrix<BaseFloat> m;
|
||||
Compute(op, waveform, &m);
|
||||
|
@ -133,12 +122,8 @@ static void UnitTestPenaltyFactor() {
|
|||
// use pitch code with default configuration..
|
||||
PitchExtractionOptions op;
|
||||
op.penalty_factor = k * 0.05;
|
||||
op.frame_opts.dither = 0.0;
|
||||
op.frame_opts.preemph_coeff = 0.0;
|
||||
op.frame_opts.window_type = "rectangular";
|
||||
op.frame_opts.remove_dc_offset = false;
|
||||
op.frame_opts.round_to_power_of_two = true;
|
||||
op.frame_opts.samp_freq = 8000;
|
||||
op.preemph_coeff = 0.0;
|
||||
op.samp_freq = 8000;
|
||||
// compute pitch.
|
||||
Matrix<BaseFloat> m;
|
||||
Compute(op, waveform, &m);
|
||||
|
@ -174,12 +159,8 @@ static void UnitTestKeeleNccfBallast() {
|
|||
SubVector<BaseFloat> waveform(wave.Data(), 0);
|
||||
// use pitch code with default configuration..
|
||||
PitchExtractionOptions op;
|
||||
op.frame_opts.dither = 0.0;
|
||||
op.frame_opts.window_type = "rectangular";
|
||||
op.frame_opts.remove_dc_offset = false;
|
||||
op.frame_opts.round_to_power_of_two = true;
|
||||
op.frame_opts.samp_freq = 8000;
|
||||
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
|
||||
op.samp_freq = 8000;
|
||||
op.preemph_coeff = exp(-7000/op.samp_freq);
|
||||
op.nccf_ballast = 0.1 * k;
|
||||
std::cout << " nccf_ballast " << op.nccf_ballast << std::endl;
|
||||
// compute pitch.
|
||||
|
@ -207,12 +188,8 @@ static void UnitTestVietnamese() {
|
|||
SubVector<BaseFloat> waveform(wave.Data(), 0);
|
||||
// use pitch code with default configuration..
|
||||
PitchExtractionOptions op;
|
||||
op.frame_opts.dither = 0.0;
|
||||
op.frame_opts.window_type = "rectangular";
|
||||
op.frame_opts.remove_dc_offset = false;
|
||||
op.frame_opts.round_to_power_of_two = false;
|
||||
op.frame_opts.samp_freq = 8000;
|
||||
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
|
||||
op.samp_freq = 8000;
|
||||
op.preemph_coeff = exp(-7000/op.samp_freq);
|
||||
// compute pitch.
|
||||
Matrix<BaseFloat> m;
|
||||
Compute(op, waveform, &m);
|
||||
|
@ -370,11 +347,8 @@ static void UnitTestPitchExtractionSpeed() {
|
|||
std::cout << "=== UnitTestPitchExtractionSpeed() ===\n";
|
||||
// use pitch code with default configuration..
|
||||
PitchExtractionOptions op_fast;
|
||||
op_fast.frame_opts.window_type = "rectangular";
|
||||
op_fast.frame_opts.remove_dc_offset = false;
|
||||
op_fast.frame_opts.round_to_power_of_two = false;
|
||||
op_fast.frame_opts.samp_freq = 8000;
|
||||
op_fast.frame_opts.preemph_coeff = exp(-7000/op_fast.frame_opts.samp_freq);
|
||||
op_fast.samp_freq = 8000;
|
||||
op_fast.preemph_coeff = exp(-7000/op_fast.samp_freq);
|
||||
op_fast.lowpass_cutoff = 1000;
|
||||
op_fast.max_f0 = 400;
|
||||
for (int32 i = 1; i < 2; i++) {
|
||||
|
@ -406,7 +380,7 @@ static void UnitTestPitchExtractionSpeed() {
|
|||
Compute(op_fast, waveform, &m);
|
||||
ftime( &tstruct );
|
||||
tend = tstruct.time * 1000 + tstruct.millitm;
|
||||
double tot_real_time = test_num * waveform.Dim() / op_fast.frame_opts.samp_freq;
|
||||
double tot_real_time = test_num * waveform.Dim() / op_fast.samp_freq;
|
||||
tot_ft = (tend - tstart)/tot_real_time;
|
||||
std::cout << " Pitch extraction time per second of speech "
|
||||
<< tot_ft << " msec " << std::endl;
|
||||
|
@ -416,11 +390,8 @@ static void UnitTestPitchExtractorCompareKeele() {
|
|||
std::cout << "=== UnitTestPitchExtractorCompareKeele() ===\n";
|
||||
// use pitch code with default configuration..
|
||||
PitchExtractionOptions op;
|
||||
op.frame_opts.window_type = "rectangular";
|
||||
op.frame_opts.remove_dc_offset = false;
|
||||
op.frame_opts.round_to_power_of_two = false;
|
||||
op.frame_opts.samp_freq = 8000;
|
||||
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
|
||||
op.samp_freq = 8000;
|
||||
op.preemph_coeff = exp(-7000/op.samp_freq);
|
||||
for (int32 i = 1; i < 11; i++) {
|
||||
std::string wavefile;
|
||||
std::string num;
|
||||
|
@ -448,11 +419,8 @@ static void UnitTestPitchExtractorCompareKeele() {
|
|||
void UnitTestDiffSampleRate() {
|
||||
int sample_rate = 16000;
|
||||
PitchExtractionOptions op_fast;
|
||||
op_fast.frame_opts.window_type = "rectangular";
|
||||
op_fast.frame_opts.remove_dc_offset = false;
|
||||
op_fast.frame_opts.round_to_power_of_two = false;
|
||||
op_fast.frame_opts.samp_freq = static_cast<double>(sample_rate);
|
||||
op_fast.frame_opts.preemph_coeff = exp(-7000/op_fast.frame_opts.samp_freq);
|
||||
op_fast.samp_freq = static_cast<double>(sample_rate);
|
||||
op_fast.preemph_coeff = exp(-7000/op_fast.samp_freq);
|
||||
op_fast.lowpass_cutoff = 1000;
|
||||
op_fast.max_f0 = 400;
|
||||
std::string samp_rate = boost::lexical_cast<std::string>(sample_rate/1000);
|
||||
|
@ -499,11 +467,8 @@ void UnitTestPostProcess() {
|
|||
KALDI_ASSERT(wave.Data().NumRows() == 1);
|
||||
SubVector<BaseFloat> waveform(wave.Data(), 0);
|
||||
PitchExtractionOptions op;
|
||||
op.frame_opts.window_type = "rectangular";
|
||||
op.frame_opts.remove_dc_offset = false;
|
||||
op.frame_opts.round_to_power_of_two = false;
|
||||
op.frame_opts.samp_freq = 8000;
|
||||
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
|
||||
op.samp_freq = 8000;
|
||||
op.preemph_coeff = exp(-7000/op.samp_freq);
|
||||
op.lowpass_cutoff = 1000;
|
||||
op.max_f0 = 400;
|
||||
Matrix<BaseFloat> m, m2;
|
||||
|
|
|
@ -158,10 +158,10 @@ void PreemphasizeFrame(VectorBase<double> *waveform, double preemph_coeff) {
|
|||
(*waveform)(0) -= preemph_coeff * (*waveform)(0);
|
||||
}
|
||||
|
||||
void ExtractFrameNccf(const VectorBase<double> &wave,
|
||||
int32 frame_num,
|
||||
const PitchExtractionOptions &opts,
|
||||
Vector<double> *window) {
|
||||
void ExtractFrame(const VectorBase<double> &wave,
|
||||
int32 frame_num,
|
||||
const PitchExtractionOptions &opts,
|
||||
Vector<double> *window) {
|
||||
|
||||
int32 frame_shift = opts.NccfWindowShift();
|
||||
int32 frame_length = opts.NccfWindowSize();
|
||||
|
@ -185,8 +185,8 @@ void ExtractFrameNccf(const VectorBase<double> &wave,
|
|||
|
||||
//if (opts.dither != 0.0) Dither(&window_part, opts.dither);
|
||||
|
||||
if (opts.frame_opts.preemph_coeff != 0.0)
|
||||
PreemphasizeFrame(&window_part, opts.frame_opts.preemph_coeff);
|
||||
if (opts.preemph_coeff != 0.0)
|
||||
PreemphasizeFrame(&window_part, opts.preemph_coeff);
|
||||
|
||||
if (end > wave.Dim())
|
||||
SubVector<double>(*window, frame_length_new-end+wave.Dim(),
|
||||
|
@ -300,7 +300,7 @@ void PreProcess(const PitchExtractionOptions opts,
|
|||
KALDI_ASSERT(processed_wave != NULL);
|
||||
// down-sample and Low-Pass filtering the input wave
|
||||
int32 num_samples_in = wave.Dim();
|
||||
double dt = opts.frame_opts.samp_freq / opts.resample_freq;
|
||||
double dt = opts.samp_freq / opts.resample_freq;
|
||||
int32 resampled_len = 1 + static_cast<int>(num_samples_in / dt);
|
||||
processed_wave->Resize(resampled_len); // filtered wave
|
||||
std::vector<double> resampled_t(resampled_len);
|
||||
|
@ -308,7 +308,7 @@ void PreProcess(const PitchExtractionOptions opts,
|
|||
resampled_t[i] = static_cast<double>(i) / opts.resample_freq;
|
||||
Matrix<double> input_wave(1, wave.Dim()), output_wave(1, resampled_len);
|
||||
input_wave.CopyRowFromVec(wave, 0);
|
||||
ArbitraryResample resample(num_samples_in, opts.frame_opts.samp_freq,
|
||||
ArbitraryResample resample(num_samples_in, opts.samp_freq,
|
||||
opts.lowpass_cutoff,
|
||||
resampled_t, opts.lowpass_filter_width);
|
||||
resample.Upsample(input_wave, &output_wave);
|
||||
|
@ -316,38 +316,39 @@ void PreProcess(const PitchExtractionOptions opts,
|
|||
|
||||
// Normalize input signal using rms
|
||||
double rms = pow(VecVec((*processed_wave),(*processed_wave))/processed_wave->Dim(), 0.5);
|
||||
(*processed_wave).Scale(1.0/rms);
|
||||
if (rms != 0.0)
|
||||
(*processed_wave).Scale(1.0/rms);
|
||||
}
|
||||
|
||||
void Nccf(const Vector<double> &wave,
|
||||
int32 start, int32 end,
|
||||
int32 Nccf_window_size,
|
||||
Vector<double> *iner_prod,
|
||||
int32 nccf_window_size,
|
||||
Vector<double> *inner_prod,
|
||||
Vector<double> *norm_prod) {
|
||||
Vector<double> zero_mean_wave(wave);
|
||||
SubVector<double> wave_part(wave, 0, Nccf_window_size);
|
||||
zero_mean_wave.Add(-wave_part.Sum()/Nccf_window_size); // subtract mean-frame from wave
|
||||
SubVector<double> wave_part(wave, 0, nccf_window_size);
|
||||
zero_mean_wave.Add(-wave_part.Sum()/nccf_window_size); // subtract mean-frame from wave
|
||||
|
||||
double e1, e2, sum;
|
||||
SubVector<double> sub_vec1(zero_mean_wave, 0, Nccf_window_size);
|
||||
SubVector<double> sub_vec1(zero_mean_wave, 0, nccf_window_size);
|
||||
e1 = VecVec(sub_vec1,sub_vec1);
|
||||
for (int32 lag = start; lag < end; lag++) {
|
||||
SubVector<double> sub_vec2(zero_mean_wave, lag, Nccf_window_size);
|
||||
SubVector<double> sub_vec2(zero_mean_wave, lag, nccf_window_size);
|
||||
e2 = VecVec(sub_vec2,sub_vec2);
|
||||
sum = VecVec(sub_vec1,sub_vec2);
|
||||
(*iner_prod)(lag-start) = sum;
|
||||
(*inner_prod)(lag-start) = sum;
|
||||
(*norm_prod)(lag-start) = e1*e2;
|
||||
}
|
||||
}
|
||||
|
||||
void ProcessNccf(const Vector<double> &iner_prod,
|
||||
void ProcessNccf(const Vector<double> &inner_prod,
|
||||
const Vector<double> &norm_prod,
|
||||
const double &a_fact,
|
||||
int32 start, int32 end,
|
||||
SubVector<double> *autocorr) {
|
||||
for (int32 lag = start; lag < end; lag++) {
|
||||
if (norm_prod(lag-start) != 0.0)
|
||||
(*autocorr)(lag) = iner_prod(lag-start) / pow(norm_prod(lag-start) + a_fact, 0.5);
|
||||
(*autocorr)(lag) = inner_prod(lag-start) / pow(norm_prod(lag-start) + a_fact, 0.5);
|
||||
KALDI_ASSERT((*autocorr)(lag) < 1.01 && (*autocorr)(lag) > -1.01);
|
||||
}
|
||||
}
|
||||
|
@ -531,19 +532,19 @@ void Compute(const PitchExtractionOptions &opts,
|
|||
Matrix<double> nccf_pitch(rows_out, num_max_lag + 1),
|
||||
nccf_pov(rows_out, num_max_lag + 1);
|
||||
for (int32 r = 0; r < rows_out; r++) { // r is frame index.
|
||||
ExtractFrameNccf( processed_wave, r,
|
||||
opts, &window);
|
||||
ExtractFrame( processed_wave, r,
|
||||
opts, &window);
|
||||
// compute nccf for pitch extraction
|
||||
a_fact = a_fact_orig;
|
||||
Vector<double> iner_prod(num_lags), norm_prod(num_lags);
|
||||
Vector<double> inner_prod(num_lags), norm_prod(num_lags);
|
||||
Nccf(window, start, end, opts.NccfWindowSize(),
|
||||
&iner_prod, &norm_prod);
|
||||
&inner_prod, &norm_prod);
|
||||
SubVector<double> nccf_pitch_vec(nccf_pitch.Row(r));
|
||||
ProcessNccf(iner_prod, norm_prod, a_fact, start, end, &(nccf_pitch_vec));
|
||||
ProcessNccf(inner_prod, norm_prod, a_fact, start, end, &(nccf_pitch_vec));
|
||||
// compute the Nccf for Probability of voicing estimation
|
||||
a_fact = pow(10,-9);
|
||||
SubVector<double> nccf_pov_vec(nccf_pov.Row(r));
|
||||
ProcessNccf(iner_prod, norm_prod, a_fact, start, end, &(nccf_pov_vec));
|
||||
ProcessNccf(inner_prod, norm_prod, a_fact, start, end, &(nccf_pov_vec));
|
||||
}
|
||||
std::vector<double> lag_vec(num_states);
|
||||
for (int32 i = 0; i < num_states; i++)
|
||||
|
|
|
@ -35,36 +35,50 @@ namespace kaldi {
|
|||
/// @{
|
||||
|
||||
struct PitchExtractionOptions {
|
||||
FrameExtractionOptions frame_opts;
|
||||
double min_f0; // min f0 to search (Hz)
|
||||
double max_f0; // max f0 to search (Hz)
|
||||
double soft_min_f0; // Minimum f0, applied in soft way, must not exceed
|
||||
// FrameExtractionOptions frame_opts;
|
||||
BaseFloat samp_freq;
|
||||
BaseFloat frame_shift_ms; // in milliseconds.
|
||||
BaseFloat frame_length_ms; // in milliseconds.
|
||||
BaseFloat preemph_coeff; // Preemphasis coefficient.
|
||||
BaseFloat min_f0; // min f0 to search (Hz)
|
||||
BaseFloat max_f0; // max f0 to search (Hz)
|
||||
BaseFloat soft_min_f0; // Minimum f0, applied in soft way, must not exceed
|
||||
// min-f0
|
||||
double penalty_factor; // cost factor for FO change
|
||||
double lowpass_cutoff; // cutoff frequency for Low pass filter
|
||||
double upsample_cutoff; // cutoff frequency we apply for upsampling Nccf
|
||||
double resample_freq; // Integer that determines filter width when upsampling NCCF
|
||||
double delta_pitch; // the pitch tolerance in pruning lags
|
||||
double nccf_ballast; // Increasing this factor reduces NCCF for quiet frames,
|
||||
BaseFloat penalty_factor; // cost factor for FO change
|
||||
BaseFloat lowpass_cutoff; // cutoff frequency for Low pass filter
|
||||
BaseFloat upsample_cutoff; // cutoff frequency we apply for upsampling Nccf
|
||||
BaseFloat resample_freq; // Integer that determines filter width when upsampling NCCF
|
||||
BaseFloat delta_pitch; // the pitch tolerance in pruning lags
|
||||
BaseFloat nccf_ballast; // Increasing this factor reduces NCCF for quiet frames,
|
||||
// helping ensure pitch continuity in unvoiced region
|
||||
int32 lowpass_filter_width; // Integer that determines filter width of
|
||||
// lowpass filter
|
||||
int32 upsample_filter_width; // Integer that determines filter width when
|
||||
// upsampling NCCF
|
||||
explicit PitchExtractionOptions() :
|
||||
min_f0(50),
|
||||
max_f0(400),
|
||||
soft_min_f0(10.0),
|
||||
penalty_factor(0.1),
|
||||
lowpass_cutoff(1500),
|
||||
upsample_cutoff(2000),
|
||||
resample_freq(4000),
|
||||
delta_pitch(0.01),
|
||||
nccf_ballast(0.625),
|
||||
lowpass_filter_width(2),
|
||||
upsample_filter_width(5) {}
|
||||
samp_freq(16000),
|
||||
frame_shift_ms(10.0),
|
||||
frame_length_ms(25.0),
|
||||
preemph_coeff(0.97),
|
||||
min_f0(50),
|
||||
max_f0(400),
|
||||
soft_min_f0(10.0),
|
||||
penalty_factor(0.1),
|
||||
lowpass_cutoff(1500),
|
||||
upsample_cutoff(2000),
|
||||
resample_freq(4000),
|
||||
delta_pitch(0.01),
|
||||
nccf_ballast(0.625),
|
||||
lowpass_filter_width(2),
|
||||
upsample_filter_width(5) {}
|
||||
void Register(ParseOptions *po) {
|
||||
frame_opts.Register(po);
|
||||
po->Register("sample-frequency", &samp_freq,
|
||||
"Waveform data sample frequency (must match the waveform file, "
|
||||
"if specified there)");
|
||||
po->Register("frame-length", &frame_length_ms, "Frame length in milliseconds");
|
||||
po->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds");
|
||||
po->Register("preemphasis-coefficient", &preemph_coeff,
|
||||
"Coefficient for use in signal preemphasis");
|
||||
po->Register("min-f0", &min_f0,
|
||||
"min. F0 to search for (Hz)");
|
||||
po->Register("max-f0", &max_f0,
|
||||
|
@ -91,10 +105,10 @@ struct PitchExtractionOptions {
|
|||
"Integer that determines filter width when upsampling NCCF");
|
||||
}
|
||||
int32 NccfWindowSize() const {
|
||||
return static_cast<int32>(resample_freq * 0.001 * frame_opts.frame_length_ms);
|
||||
return static_cast<int32>(resample_freq * 0.001 * frame_length_ms);
|
||||
}
|
||||
int32 NccfWindowShift() const {
|
||||
return static_cast<int32>(resample_freq * 0.001 * frame_opts.frame_shift_ms);
|
||||
return static_cast<int32>(resample_freq * 0.001 * frame_shift_ms);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче