sandbox/pitch: various pitch-code cleanups, removing windowing config and replacing the variables we needed from there.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/pitch@3188 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2013-11-21 19:37:04 +00:00 · 2013-11-21 19:37:04 +00:00 · 8d07c0d6bb
--- a/src/feat/pitch-functions-test.cc
+++ b/src/feat/pitch-functions-test.cc
@ -41,11 +41,7 @@ static void UnitTestSimple() {
  // the parametrization object
  PitchExtractionOptions op;
  // trying to have same opts as baseline.
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
+  op.preemph_coeff = 0.0;
  // compute pitch.
  Compute(op, v, &m);
  std::cout << "Test passed :)\n\n";
@ -66,11 +62,7 @@ static void UnitTestGetf0Compare1() {
  }
  // use pitch code with default configuration..
  PitchExtractionOptions op;  
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.preemph_coeff = 0.0;
-  op.frame_opts.window_type = "hamming";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = true;
+  op.preemph_coeff = 0.0;
  // compute pitch.
  Matrix<BaseFloat> m;
  Compute(op, waveform, &m);    
@ -97,11 +89,8 @@ static void UnitTestGetf0CompareKeele() {
    SubVector<BaseFloat> waveform(wave.Data(), 0); 
    // use pitch code with default configuration..
    PitchExtractionOptions op;
-    op.frame_opts.window_type = "rectangular";
-    op.frame_opts.remove_dc_offset = false;
-    op.frame_opts.round_to_power_of_two = false;
-    op.frame_opts.samp_freq = 8000;
-    op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
+    op.samp_freq = 8000;
+    op.preemph_coeff = exp(-7000/op.samp_freq);
    // compute pitch.
    Matrix<BaseFloat> m;
    Compute(op, waveform, &m);    
@ -133,12 +122,8 @@ static void UnitTestPenaltyFactor() {
      // use pitch code with default configuration..
      PitchExtractionOptions op;  
      op.penalty_factor = k * 0.05;
-      op.frame_opts.dither = 0.0;
-      op.frame_opts.preemph_coeff = 0.0;
-      op.frame_opts.window_type = "rectangular";
-      op.frame_opts.remove_dc_offset = false;
-      op.frame_opts.round_to_power_of_two = true;
-      op.frame_opts.samp_freq = 8000;
+      op.preemph_coeff = 0.0;
+      op.samp_freq = 8000;
      // compute pitch.
      Matrix<BaseFloat> m;
      Compute(op, waveform, &m);    
@ -174,12 +159,8 @@ static void UnitTestKeeleNccfBallast() {
      SubVector<BaseFloat> waveform(wave.Data(), 0); 
      // use pitch code with default configuration..
      PitchExtractionOptions op;  
-      op.frame_opts.dither = 0.0;
-      op.frame_opts.window_type = "rectangular";
-      op.frame_opts.remove_dc_offset = false;
-      op.frame_opts.round_to_power_of_two = true;
-      op.frame_opts.samp_freq = 8000;
-      op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
+      op.samp_freq = 8000;
+      op.preemph_coeff = exp(-7000/op.samp_freq);
      op.nccf_ballast = 0.1 * k;  
      std::cout << " nccf_ballast " << op.nccf_ballast << std::endl;
      // compute pitch.
@ -207,12 +188,8 @@ static void UnitTestVietnamese() {
  SubVector<BaseFloat> waveform(wave.Data(), 0); 
  // use pitch code with default configuration..
  PitchExtractionOptions op;
-  op.frame_opts.dither = 0.0;
-  op.frame_opts.window_type = "rectangular";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = false;
-  op.frame_opts.samp_freq = 8000;
-  op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
+  op.samp_freq = 8000;
+  op.preemph_coeff = exp(-7000/op.samp_freq);
  // compute pitch.
  Matrix<BaseFloat> m;
  Compute(op, waveform, &m);    
@ -370,11 +347,8 @@ static void UnitTestPitchExtractionSpeed() {
  std::cout << "=== UnitTestPitchExtractionSpeed() ===\n";
  // use pitch code with default configuration..
  PitchExtractionOptions op_fast;
-  op_fast.frame_opts.window_type = "rectangular";
-  op_fast.frame_opts.remove_dc_offset = false;
-  op_fast.frame_opts.round_to_power_of_two = false;
-  op_fast.frame_opts.samp_freq = 8000;
-  op_fast.frame_opts.preemph_coeff = exp(-7000/op_fast.frame_opts.samp_freq);
+  op_fast.samp_freq = 8000;
+  op_fast.preemph_coeff = exp(-7000/op_fast.samp_freq);
  op_fast.lowpass_cutoff = 1000;
  op_fast.max_f0 = 400; 
  for (int32 i = 1; i < 2; i++) {
@ -406,7 +380,7 @@ static void UnitTestPitchExtractionSpeed() {
      Compute(op_fast, waveform, &m);
    ftime( &tstruct );
    tend = tstruct.time * 1000 + tstruct.millitm;
-    double tot_real_time = test_num * waveform.Dim() / op_fast.frame_opts.samp_freq;
+    double tot_real_time = test_num * waveform.Dim() / op_fast.samp_freq;
    tot_ft = (tend - tstart)/tot_real_time;
    std::cout << " Pitch extraction time per second of speech " 
              << tot_ft << " msec " << std::endl;
@ -416,11 +390,8 @@ static void UnitTestPitchExtractorCompareKeele() {
  std::cout << "=== UnitTestPitchExtractorCompareKeele() ===\n";
  // use pitch code with default configuration..
  PitchExtractionOptions op;
-  op.frame_opts.window_type = "rectangular";
-  op.frame_opts.remove_dc_offset = false;
-  op.frame_opts.round_to_power_of_two = false;
-  op.frame_opts.samp_freq = 8000;
-  op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
+  op.samp_freq = 8000;
+  op.preemph_coeff = exp(-7000/op.samp_freq);
  for (int32 i = 1; i < 11; i++) {
    std::string wavefile;
    std::string num;
@ -448,11 +419,8 @@ static void UnitTestPitchExtractorCompareKeele() {
 void UnitTestDiffSampleRate() {
  int sample_rate = 16000; 
  PitchExtractionOptions op_fast;
-  op_fast.frame_opts.window_type = "rectangular";
-  op_fast.frame_opts.remove_dc_offset = false;
-  op_fast.frame_opts.round_to_power_of_two = false;
-  op_fast.frame_opts.samp_freq = static_cast<double>(sample_rate);
-  op_fast.frame_opts.preemph_coeff = exp(-7000/op_fast.frame_opts.samp_freq);
+  op_fast.samp_freq = static_cast<double>(sample_rate);
+  op_fast.preemph_coeff = exp(-7000/op_fast.samp_freq);
  op_fast.lowpass_cutoff = 1000;
  op_fast.max_f0 = 400;
  std::string samp_rate = boost::lexical_cast<std::string>(sample_rate/1000);
@ -499,11 +467,8 @@ void UnitTestPostProcess() {
    KALDI_ASSERT(wave.Data().NumRows() == 1);       
    SubVector<BaseFloat> waveform(wave.Data(), 0);
    PitchExtractionOptions op;
-    op.frame_opts.window_type = "rectangular";
-    op.frame_opts.remove_dc_offset = false;
-    op.frame_opts.round_to_power_of_two = false;
-    op.frame_opts.samp_freq = 8000;
-    op.frame_opts.preemph_coeff = exp(-7000/op.frame_opts.samp_freq);
+    op.samp_freq = 8000;
+    op.preemph_coeff = exp(-7000/op.samp_freq);
    op.lowpass_cutoff = 1000;
    op.max_f0 = 400;
    Matrix<BaseFloat> m, m2;
--- a/src/feat/pitch-functions.cc
+++ b/src/feat/pitch-functions.cc
@ -158,10 +158,10 @@ void PreemphasizeFrame(VectorBase<double> *waveform, double preemph_coeff) {
  (*waveform)(0) -= preemph_coeff * (*waveform)(0);
 }

-void ExtractFrameNccf(const VectorBase<double> &wave,
-                      int32 frame_num,
-                      const PitchExtractionOptions &opts,
-                      Vector<double> *window) {
+void ExtractFrame(const VectorBase<double> &wave,
+                  int32 frame_num,
+                  const PitchExtractionOptions &opts,
+                  Vector<double> *window) {

  int32 frame_shift = opts.NccfWindowShift();
  int32 frame_length = opts.NccfWindowSize();
@ -185,8 +185,8 @@ void ExtractFrameNccf(const VectorBase<double> &wave,

  //if (opts.dither != 0.0) Dither(&window_part, opts.dither);

-  if (opts.frame_opts.preemph_coeff != 0.0)
-    PreemphasizeFrame(&window_part, opts.frame_opts.preemph_coeff);
+  if (opts.preemph_coeff != 0.0)
+    PreemphasizeFrame(&window_part, opts.preemph_coeff);

  if (end > wave.Dim())
    SubVector<double>(*window, frame_length_new-end+wave.Dim(),
@ -300,7 +300,7 @@ void PreProcess(const PitchExtractionOptions opts,
  KALDI_ASSERT(processed_wave != NULL);
  // down-sample and Low-Pass filtering the input wave
  int32 num_samples_in = wave.Dim();
-  double dt = opts.frame_opts.samp_freq / opts.resample_freq;
+  double dt = opts.samp_freq / opts.resample_freq;
  int32 resampled_len = 1 + static_cast<int>(num_samples_in / dt);
  processed_wave->Resize(resampled_len); // filtered wave
  std::vector<double> resampled_t(resampled_len);
@ -308,7 +308,7 @@ void PreProcess(const PitchExtractionOptions opts,
    resampled_t[i] = static_cast<double>(i) / opts.resample_freq;
  Matrix<double> input_wave(1, wave.Dim()), output_wave(1, resampled_len);
  input_wave.CopyRowFromVec(wave, 0);
-  ArbitraryResample resample(num_samples_in, opts.frame_opts.samp_freq,
+  ArbitraryResample resample(num_samples_in, opts.samp_freq,
                             opts.lowpass_cutoff,
                             resampled_t, opts.lowpass_filter_width);
  resample.Upsample(input_wave, &output_wave);
@ -316,38 +316,39 @@ void PreProcess(const PitchExtractionOptions opts,

  // Normalize input signal using rms
  double rms = pow(VecVec((*processed_wave),(*processed_wave))/processed_wave->Dim(), 0.5);
-  (*processed_wave).Scale(1.0/rms);
+  if (rms != 0.0)
+    (*processed_wave).Scale(1.0/rms);
 }

 void Nccf(const Vector<double> &wave,
          int32 start, int32 end,
-          int32 Nccf_window_size,
-          Vector<double> *iner_prod,
+          int32 nccf_window_size,
+          Vector<double> *inner_prod,
          Vector<double> *norm_prod) {
  Vector<double> zero_mean_wave(wave);
-  SubVector<double> wave_part(wave, 0, Nccf_window_size);
-  zero_mean_wave.Add(-wave_part.Sum()/Nccf_window_size); // subtract mean-frame from wave
+  SubVector<double> wave_part(wave, 0, nccf_window_size);
+  zero_mean_wave.Add(-wave_part.Sum()/nccf_window_size); // subtract mean-frame from wave

  double e1, e2, sum;
-  SubVector<double> sub_vec1(zero_mean_wave, 0, Nccf_window_size);
+  SubVector<double> sub_vec1(zero_mean_wave, 0, nccf_window_size);
  e1 = VecVec(sub_vec1,sub_vec1);
  for (int32 lag = start; lag < end; lag++) {
-    SubVector<double> sub_vec2(zero_mean_wave, lag, Nccf_window_size);
+    SubVector<double> sub_vec2(zero_mean_wave, lag, nccf_window_size);
    e2 = VecVec(sub_vec2,sub_vec2);
    sum = VecVec(sub_vec1,sub_vec2);
-    (*iner_prod)(lag-start) = sum;
+    (*inner_prod)(lag-start) = sum;
    (*norm_prod)(lag-start) = e1*e2;
  }
 }

-void ProcessNccf(const Vector<double> &iner_prod,
+void ProcessNccf(const Vector<double> &inner_prod,
                 const Vector<double> &norm_prod,
                 const double &a_fact,
                 int32 start, int32 end,
                 SubVector<double> *autocorr) {
  for (int32 lag = start; lag < end; lag++) {
    if (norm_prod(lag-start) != 0.0)
-      (*autocorr)(lag) = iner_prod(lag-start) / pow(norm_prod(lag-start) + a_fact, 0.5);
+      (*autocorr)(lag) = inner_prod(lag-start) / pow(norm_prod(lag-start) + a_fact, 0.5);
    KALDI_ASSERT((*autocorr)(lag) < 1.01 && (*autocorr)(lag) > -1.01);
  }
 }
@ -531,19 +532,19 @@ void Compute(const PitchExtractionOptions &opts,
  Matrix<double> nccf_pitch(rows_out, num_max_lag + 1),
      nccf_pov(rows_out, num_max_lag + 1);
  for (int32 r = 0; r < rows_out; r++) {  // r is frame index.
-    ExtractFrameNccf( processed_wave, r,
-                      opts, &window);
+    ExtractFrame( processed_wave, r,
+                  opts, &window);
    // compute nccf for pitch extraction
    a_fact = a_fact_orig;
-    Vector<double> iner_prod(num_lags), norm_prod(num_lags);
+    Vector<double> inner_prod(num_lags), norm_prod(num_lags);
    Nccf(window, start, end, opts.NccfWindowSize(),
-         &iner_prod, &norm_prod);
+         &inner_prod, &norm_prod);
    SubVector<double> nccf_pitch_vec(nccf_pitch.Row(r));
-    ProcessNccf(iner_prod, norm_prod, a_fact, start, end, &(nccf_pitch_vec));
+    ProcessNccf(inner_prod, norm_prod, a_fact, start, end, &(nccf_pitch_vec));
    // compute the Nccf for Probability of voicing estimation
    a_fact = pow(10,-9);
    SubVector<double> nccf_pov_vec(nccf_pov.Row(r));
-    ProcessNccf(iner_prod, norm_prod, a_fact, start, end, &(nccf_pov_vec));
+    ProcessNccf(inner_prod, norm_prod, a_fact, start, end, &(nccf_pov_vec));
  }
  std::vector<double> lag_vec(num_states);
  for (int32 i = 0; i < num_states; i++)
--- a/src/feat/pitch-functions.h
+++ b/src/feat/pitch-functions.h
@ -35,36 +35,50 @@ namespace kaldi {
 /// @{

 struct PitchExtractionOptions {
-  FrameExtractionOptions frame_opts;
-  double min_f0;          // min f0 to search (Hz)
-  double max_f0;          // max f0 to search (Hz)
-  double soft_min_f0;     // Minimum f0, applied in soft way, must not exceed
+  // FrameExtractionOptions frame_opts;
+  BaseFloat samp_freq;
+  BaseFloat frame_shift_ms;  // in milliseconds.
+  BaseFloat frame_length_ms;  // in milliseconds.
+  BaseFloat preemph_coeff;  // Preemphasis coefficient.  
+  BaseFloat min_f0;          // min f0 to search (Hz)
+  BaseFloat max_f0;          // max f0 to search (Hz)
+  BaseFloat soft_min_f0;     // Minimum f0, applied in soft way, must not exceed
                          // min-f0
-  double penalty_factor;  // cost factor for FO change
-  double lowpass_cutoff;  // cutoff frequency for Low pass filter
-  double upsample_cutoff; // cutoff frequency we apply for upsampling Nccf
-  double resample_freq;   // Integer that determines filter width when upsampling NCCF
-  double delta_pitch;     // the pitch tolerance in pruning lags
-  double nccf_ballast;    // Increasing this factor reduces NCCF for quiet frames,
+  BaseFloat penalty_factor;  // cost factor for FO change
+  BaseFloat lowpass_cutoff;  // cutoff frequency for Low pass filter
+  BaseFloat upsample_cutoff; // cutoff frequency we apply for upsampling Nccf
+  BaseFloat resample_freq;   // Integer that determines filter width when upsampling NCCF
+  BaseFloat delta_pitch;     // the pitch tolerance in pruning lags
+  BaseFloat nccf_ballast;    // Increasing this factor reduces NCCF for quiet frames,
                          // helping ensure pitch continuity in unvoiced region
  int32 lowpass_filter_width;   // Integer that determines filter width of
                                // lowpass filter
  int32 upsample_filter_width;  // Integer that determines filter width when
                                // upsampling NCCF
  explicit PitchExtractionOptions() :
-    min_f0(50),
-    max_f0(400),
-    soft_min_f0(10.0),
-    penalty_factor(0.1),
-    lowpass_cutoff(1500),
-    upsample_cutoff(2000),
-    resample_freq(4000),
-    delta_pitch(0.01),
-    nccf_ballast(0.625),
-    lowpass_filter_width(2),
-    upsample_filter_width(5) {}
+      samp_freq(16000),
+      frame_shift_ms(10.0),
+      frame_length_ms(25.0),
+      preemph_coeff(0.97),
+      min_f0(50),
+      max_f0(400),
+      soft_min_f0(10.0),
+      penalty_factor(0.1),
+      lowpass_cutoff(1500),
+      upsample_cutoff(2000),
+      resample_freq(4000),
+      delta_pitch(0.01),
+      nccf_ballast(0.625),
+      lowpass_filter_width(2),
+      upsample_filter_width(5) {}
  void Register(ParseOptions *po) {
-    frame_opts.Register(po);
+    po->Register("sample-frequency", &samp_freq,
+                 "Waveform data sample frequency (must match the waveform file, "
+                 "if specified there)");
+    po->Register("frame-length", &frame_length_ms, "Frame length in milliseconds");
+    po->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds");
+    po->Register("preemphasis-coefficient", &preemph_coeff,
+                 "Coefficient for use in signal preemphasis");
    po->Register("min-f0", &min_f0,
                 "min. F0 to search for (Hz)");
    po->Register("max-f0", &max_f0,
@ -91,10 +105,10 @@ struct PitchExtractionOptions {
                 "Integer that determines filter width when upsampling NCCF");
  }
  int32 NccfWindowSize() const {
-    return static_cast<int32>(resample_freq * 0.001 * frame_opts.frame_length_ms);
+    return static_cast<int32>(resample_freq * 0.001 * frame_length_ms);
  }
  int32 NccfWindowShift() const {
-    return static_cast<int32>(resample_freq * 0.001 * frame_opts.frame_shift_ms);
+    return static_cast<int32>(resample_freq * 0.001 * frame_shift_ms);
  }
 };