sandbox/pitch: various pitch-code cleanups, removing windowing config and replacing the variables we needed from there.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/pitch@3188 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2013-11-21 19:37:04 +00:00
Родитель 98db213695
Коммит 8d07c0d6bb
3 изменённых файлов: 82 добавлений и 102 удалений

Просмотреть файл

@ -41,11 +41,7 @@ static void UnitTestSimple() {
// the parametrization object
PitchExtractionOptions op;
// trying to have same opts as baseline.
op.frame_opts.dither = 0.0;
op.frame_opts.preemph_coeff = 0.0;
op.frame_opts.window_type = "hamming";
op.frame_opts.remove_dc_offset = false;
op.frame_opts.round_to_power_of_two = true;
op.preemph_coeff = 0.0;
// compute pitch.
Compute(op, v, &m);
std::cout << "Test passed :)\n\n";
@ -66,11 +62,7 @@ static void UnitTestGetf0Compare1() {
}
// use pitch code with default configuration..
PitchExtractionOptions op;
op.frame_opts.dither = 0.0;
op.frame_opts.preemph_coeff = 0.0;
op.frame_opts.window_type = "hamming";
op.frame_opts.remove_dc_offset = false;
op.frame_opts.round_to_power_of_two = true;
op.preemph_coeff = 0.0;
// compute pitch.
Matrix<BaseFloat> m;
Compute(op, waveform, &m);
@ -97,11 +89,8 @@ static void UnitTestGetf0CompareKeele() {
SubVector<BaseFloat> waveform(wave.Data(), 0);
// use pitch code with default configuration..
PitchExtractionOptions op;
op.frame_opts.window_type = "rectangular";
op.frame_opts.remove_dc_offset = false;
op.frame_opts.round_to_power_of_two = false;
op.frame_opts.samp_freq = 8000;
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
op.samp_freq = 8000;
op.preemph_coeff = exp(-7000/op.samp_freq);
// compute pitch.
Matrix<BaseFloat> m;
Compute(op, waveform, &m);
@ -133,12 +122,8 @@ static void UnitTestPenaltyFactor() {
// use pitch code with default configuration..
PitchExtractionOptions op;
op.penalty_factor = k * 0.05;
op.frame_opts.dither = 0.0;
op.frame_opts.preemph_coeff = 0.0;
op.frame_opts.window_type = "rectangular";
op.frame_opts.remove_dc_offset = false;
op.frame_opts.round_to_power_of_two = true;
op.frame_opts.samp_freq = 8000;
op.preemph_coeff = 0.0;
op.samp_freq = 8000;
// compute pitch.
Matrix<BaseFloat> m;
Compute(op, waveform, &m);
@ -174,12 +159,8 @@ static void UnitTestKeeleNccfBallast() {
SubVector<BaseFloat> waveform(wave.Data(), 0);
// use pitch code with default configuration..
PitchExtractionOptions op;
op.frame_opts.dither = 0.0;
op.frame_opts.window_type = "rectangular";
op.frame_opts.remove_dc_offset = false;
op.frame_opts.round_to_power_of_two = true;
op.frame_opts.samp_freq = 8000;
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
op.samp_freq = 8000;
op.preemph_coeff = exp(-7000/op.samp_freq);
op.nccf_ballast = 0.1 * k;
std::cout << " nccf_ballast " << op.nccf_ballast << std::endl;
// compute pitch.
@ -207,12 +188,8 @@ static void UnitTestVietnamese() {
SubVector<BaseFloat> waveform(wave.Data(), 0);
// use pitch code with default configuration..
PitchExtractionOptions op;
op.frame_opts.dither = 0.0;
op.frame_opts.window_type = "rectangular";
op.frame_opts.remove_dc_offset = false;
op.frame_opts.round_to_power_of_two = false;
op.frame_opts.samp_freq = 8000;
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
op.samp_freq = 8000;
op.preemph_coeff = exp(-7000/op.samp_freq);
// compute pitch.
Matrix<BaseFloat> m;
Compute(op, waveform, &m);
@ -370,11 +347,8 @@ static void UnitTestPitchExtractionSpeed() {
std::cout << "=== UnitTestPitchExtractionSpeed() ===\n";
// use pitch code with default configuration..
PitchExtractionOptions op_fast;
op_fast.frame_opts.window_type = "rectangular";
op_fast.frame_opts.remove_dc_offset = false;
op_fast.frame_opts.round_to_power_of_two = false;
op_fast.frame_opts.samp_freq = 8000;
op_fast.frame_opts.preemph_coeff = exp(-7000/op_fast.frame_opts.samp_freq);
op_fast.samp_freq = 8000;
op_fast.preemph_coeff = exp(-7000/op_fast.samp_freq);
op_fast.lowpass_cutoff = 1000;
op_fast.max_f0 = 400;
for (int32 i = 1; i < 2; i++) {
@ -406,7 +380,7 @@ static void UnitTestPitchExtractionSpeed() {
Compute(op_fast, waveform, &m);
ftime( &tstruct );
tend = tstruct.time * 1000 + tstruct.millitm;
double tot_real_time = test_num * waveform.Dim() / op_fast.frame_opts.samp_freq;
double tot_real_time = test_num * waveform.Dim() / op_fast.samp_freq;
tot_ft = (tend - tstart)/tot_real_time;
std::cout << " Pitch extraction time per second of speech "
<< tot_ft << " msec " << std::endl;
@ -416,11 +390,8 @@ static void UnitTestPitchExtractorCompareKeele() {
std::cout << "=== UnitTestPitchExtractorCompareKeele() ===\n";
// use pitch code with default configuration..
PitchExtractionOptions op;
op.frame_opts.window_type = "rectangular";
op.frame_opts.remove_dc_offset = false;
op.frame_opts.round_to_power_of_two = false;
op.frame_opts.samp_freq = 8000;
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
op.samp_freq = 8000;
op.preemph_coeff = exp(-7000/op.samp_freq);
for (int32 i = 1; i < 11; i++) {
std::string wavefile;
std::string num;
@ -448,11 +419,8 @@ static void UnitTestPitchExtractorCompareKeele() {
void UnitTestDiffSampleRate() {
int sample_rate = 16000;
PitchExtractionOptions op_fast;
op_fast.frame_opts.window_type = "rectangular";
op_fast.frame_opts.remove_dc_offset = false;
op_fast.frame_opts.round_to_power_of_two = false;
op_fast.frame_opts.samp_freq = static_cast<double>(sample_rate);
op_fast.frame_opts.preemph_coeff = exp(-7000/op_fast.frame_opts.samp_freq);
op_fast.samp_freq = static_cast<double>(sample_rate);
op_fast.preemph_coeff = exp(-7000/op_fast.samp_freq);
op_fast.lowpass_cutoff = 1000;
op_fast.max_f0 = 400;
std::string samp_rate = boost::lexical_cast<std::string>(sample_rate/1000);
@ -499,11 +467,8 @@ void UnitTestPostProcess() {
KALDI_ASSERT(wave.Data().NumRows() == 1);
SubVector<BaseFloat> waveform(wave.Data(), 0);
PitchExtractionOptions op;
op.frame_opts.window_type = "rectangular";
op.frame_opts.remove_dc_offset = false;
op.frame_opts.round_to_power_of_two = false;
op.frame_opts.samp_freq = 8000;
op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
op.samp_freq = 8000;
op.preemph_coeff = exp(-7000/op.samp_freq);
op.lowpass_cutoff = 1000;
op.max_f0 = 400;
Matrix<BaseFloat> m, m2;

Просмотреть файл

@ -158,10 +158,10 @@ void PreemphasizeFrame(VectorBase<double> *waveform, double preemph_coeff) {
(*waveform)(0) -= preemph_coeff * (*waveform)(0);
}
void ExtractFrameNccf(const VectorBase<double> &wave,
int32 frame_num,
const PitchExtractionOptions &opts,
Vector<double> *window) {
void ExtractFrame(const VectorBase<double> &wave,
int32 frame_num,
const PitchExtractionOptions &opts,
Vector<double> *window) {
int32 frame_shift = opts.NccfWindowShift();
int32 frame_length = opts.NccfWindowSize();
@ -185,8 +185,8 @@ void ExtractFrameNccf(const VectorBase<double> &wave,
//if (opts.dither != 0.0) Dither(&window_part, opts.dither);
if (opts.frame_opts.preemph_coeff != 0.0)
PreemphasizeFrame(&window_part, opts.frame_opts.preemph_coeff);
if (opts.preemph_coeff != 0.0)
PreemphasizeFrame(&window_part, opts.preemph_coeff);
if (end > wave.Dim())
SubVector<double>(*window, frame_length_new-end+wave.Dim(),
@ -300,7 +300,7 @@ void PreProcess(const PitchExtractionOptions opts,
KALDI_ASSERT(processed_wave != NULL);
// down-sample and Low-Pass filtering the input wave
int32 num_samples_in = wave.Dim();
double dt = opts.frame_opts.samp_freq / opts.resample_freq;
double dt = opts.samp_freq / opts.resample_freq;
int32 resampled_len = 1 + static_cast<int>(num_samples_in / dt);
processed_wave->Resize(resampled_len); // filtered wave
std::vector<double> resampled_t(resampled_len);
@ -308,7 +308,7 @@ void PreProcess(const PitchExtractionOptions opts,
resampled_t[i] = static_cast<double>(i) / opts.resample_freq;
Matrix<double> input_wave(1, wave.Dim()), output_wave(1, resampled_len);
input_wave.CopyRowFromVec(wave, 0);
ArbitraryResample resample(num_samples_in, opts.frame_opts.samp_freq,
ArbitraryResample resample(num_samples_in, opts.samp_freq,
opts.lowpass_cutoff,
resampled_t, opts.lowpass_filter_width);
resample.Upsample(input_wave, &output_wave);
@ -316,38 +316,39 @@ void PreProcess(const PitchExtractionOptions opts,
// Normalize input signal using rms
double rms = pow(VecVec((*processed_wave),(*processed_wave))/processed_wave->Dim(), 0.5);
(*processed_wave).Scale(1.0/rms);
if (rms != 0.0)
(*processed_wave).Scale(1.0/rms);
}
void Nccf(const Vector<double> &wave,
int32 start, int32 end,
int32 Nccf_window_size,
Vector<double> *iner_prod,
int32 nccf_window_size,
Vector<double> *inner_prod,
Vector<double> *norm_prod) {
Vector<double> zero_mean_wave(wave);
SubVector<double> wave_part(wave, 0, Nccf_window_size);
zero_mean_wave.Add(-wave_part.Sum()/Nccf_window_size); // subtract mean-frame from wave
SubVector<double> wave_part(wave, 0, nccf_window_size);
zero_mean_wave.Add(-wave_part.Sum()/nccf_window_size); // subtract mean-frame from wave
double e1, e2, sum;
SubVector<double> sub_vec1(zero_mean_wave, 0, Nccf_window_size);
SubVector<double> sub_vec1(zero_mean_wave, 0, nccf_window_size);
e1 = VecVec(sub_vec1,sub_vec1);
for (int32 lag = start; lag < end; lag++) {
SubVector<double> sub_vec2(zero_mean_wave, lag, Nccf_window_size);
SubVector<double> sub_vec2(zero_mean_wave, lag, nccf_window_size);
e2 = VecVec(sub_vec2,sub_vec2);
sum = VecVec(sub_vec1,sub_vec2);
(*iner_prod)(lag-start) = sum;
(*inner_prod)(lag-start) = sum;
(*norm_prod)(lag-start) = e1*e2;
}
}
void ProcessNccf(const Vector<double> &iner_prod,
void ProcessNccf(const Vector<double> &inner_prod,
const Vector<double> &norm_prod,
const double &a_fact,
int32 start, int32 end,
SubVector<double> *autocorr) {
for (int32 lag = start; lag < end; lag++) {
if (norm_prod(lag-start) != 0.0)
(*autocorr)(lag) = iner_prod(lag-start) / pow(norm_prod(lag-start) + a_fact, 0.5);
(*autocorr)(lag) = inner_prod(lag-start) / pow(norm_prod(lag-start) + a_fact, 0.5);
KALDI_ASSERT((*autocorr)(lag) < 1.01 && (*autocorr)(lag) > -1.01);
}
}
@ -531,19 +532,19 @@ void Compute(const PitchExtractionOptions &opts,
Matrix<double> nccf_pitch(rows_out, num_max_lag + 1),
nccf_pov(rows_out, num_max_lag + 1);
for (int32 r = 0; r < rows_out; r++) { // r is frame index.
ExtractFrameNccf( processed_wave, r,
opts, &window);
ExtractFrame( processed_wave, r,
opts, &window);
// compute nccf for pitch extraction
a_fact = a_fact_orig;
Vector<double> iner_prod(num_lags), norm_prod(num_lags);
Vector<double> inner_prod(num_lags), norm_prod(num_lags);
Nccf(window, start, end, opts.NccfWindowSize(),
&iner_prod, &norm_prod);
&inner_prod, &norm_prod);
SubVector<double> nccf_pitch_vec(nccf_pitch.Row(r));
ProcessNccf(iner_prod, norm_prod, a_fact, start, end, &(nccf_pitch_vec));
ProcessNccf(inner_prod, norm_prod, a_fact, start, end, &(nccf_pitch_vec));
// compute the Nccf for Probability of voicing estimation
a_fact = pow(10,-9);
SubVector<double> nccf_pov_vec(nccf_pov.Row(r));
ProcessNccf(iner_prod, norm_prod, a_fact, start, end, &(nccf_pov_vec));
ProcessNccf(inner_prod, norm_prod, a_fact, start, end, &(nccf_pov_vec));
}
std::vector<double> lag_vec(num_states);
for (int32 i = 0; i < num_states; i++)

Просмотреть файл

@ -35,36 +35,50 @@ namespace kaldi {
/// @{
struct PitchExtractionOptions {
FrameExtractionOptions frame_opts;
double min_f0; // min f0 to search (Hz)
double max_f0; // max f0 to search (Hz)
double soft_min_f0; // Minimum f0, applied in soft way, must not exceed
// FrameExtractionOptions frame_opts;
BaseFloat samp_freq;
BaseFloat frame_shift_ms; // in milliseconds.
BaseFloat frame_length_ms; // in milliseconds.
BaseFloat preemph_coeff; // Preemphasis coefficient.
BaseFloat min_f0; // min f0 to search (Hz)
BaseFloat max_f0; // max f0 to search (Hz)
BaseFloat soft_min_f0; // Minimum f0, applied in soft way, must not exceed
// min-f0
double penalty_factor; // cost factor for FO change
double lowpass_cutoff; // cutoff frequency for Low pass filter
double upsample_cutoff; // cutoff frequency we apply for upsampling Nccf
double resample_freq; // Integer that determines filter width when upsampling NCCF
double delta_pitch; // the pitch tolerance in pruning lags
double nccf_ballast; // Increasing this factor reduces NCCF for quiet frames,
BaseFloat penalty_factor; // cost factor for FO change
BaseFloat lowpass_cutoff; // cutoff frequency for Low pass filter
BaseFloat upsample_cutoff; // cutoff frequency we apply for upsampling Nccf
BaseFloat resample_freq; // Integer that determines filter width when upsampling NCCF
BaseFloat delta_pitch; // the pitch tolerance in pruning lags
BaseFloat nccf_ballast; // Increasing this factor reduces NCCF for quiet frames,
// helping ensure pitch continuity in unvoiced region
int32 lowpass_filter_width; // Integer that determines filter width of
// lowpass filter
int32 upsample_filter_width; // Integer that determines filter width when
// upsampling NCCF
explicit PitchExtractionOptions() :
min_f0(50),
max_f0(400),
soft_min_f0(10.0),
penalty_factor(0.1),
lowpass_cutoff(1500),
upsample_cutoff(2000),
resample_freq(4000),
delta_pitch(0.01),
nccf_ballast(0.625),
lowpass_filter_width(2),
upsample_filter_width(5) {}
samp_freq(16000),
frame_shift_ms(10.0),
frame_length_ms(25.0),
preemph_coeff(0.97),
min_f0(50),
max_f0(400),
soft_min_f0(10.0),
penalty_factor(0.1),
lowpass_cutoff(1500),
upsample_cutoff(2000),
resample_freq(4000),
delta_pitch(0.01),
nccf_ballast(0.625),
lowpass_filter_width(2),
upsample_filter_width(5) {}
void Register(ParseOptions *po) {
frame_opts.Register(po);
po->Register("sample-frequency", &samp_freq,
"Waveform data sample frequency (must match the waveform file, "
"if specified there)");
po->Register("frame-length", &frame_length_ms, "Frame length in milliseconds");
po->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds");
po->Register("preemphasis-coefficient", &preemph_coeff,
"Coefficient for use in signal preemphasis");
po->Register("min-f0", &min_f0,
"min. F0 to search for (Hz)");
po->Register("max-f0", &max_f0,
@ -91,10 +105,10 @@ struct PitchExtractionOptions {
"Integer that determines filter width when upsampling NCCF");
}
int32 NccfWindowSize() const {
return static_cast<int32>(resample_freq * 0.001 * frame_opts.frame_length_ms);
return static_cast<int32>(resample_freq * 0.001 * frame_length_ms);
}
int32 NccfWindowShift() const {
return static_cast<int32>(resample_freq * 0.001 * frame_opts.frame_shift_ms);
return static_cast<int32>(resample_freq * 0.001 * frame_shift_ms);
}
};