feature-window.cc 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. // kaldi-native-fbank/csrc/feature-window.cc
  2. //
  3. // Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
  4. // This file is copied/modified from kaldi/src/feat/feature-window.cc
  5. #include "feature-window.h"
  6. #include <algorithm>
  7. #include <cmath>
  8. #include <limits>
  9. #include <vector>
  10. #ifndef M_2PI
  11. #define M_2PI 6.283185307179586476925286766559005
  12. #endif
  13. namespace knf {
  14. std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
  15. os << opts.ToString();
  16. return os;
  17. }
  18. FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
  19. : window_(opts.WindowSize()) {
  20. int32_t frame_length = opts.WindowSize();
  21. KNF_CHECK_GT(frame_length, 0);
  22. float *window_data = window_.data();
  23. double a = M_2PI / (frame_length - 1);
  24. for (int32_t i = 0; i < frame_length; i++) {
  25. double i_fl = static_cast<double>(i);
  26. if (opts.window_type == "hanning") {
  27. window_data[i] = 0.5 - 0.5 * cos(a * i_fl);
  28. } else if (opts.window_type == "sine") {
  29. // when you are checking ws wikipedia, please
  30. // note that 0.5 * a = M_PI/(frame_length-1)
  31. window_data[i] = sin(0.5 * a * i_fl);
  32. } else if (opts.window_type == "hamming") {
  33. window_data[i] = 0.54 - 0.46 * cos(a * i_fl);
  34. } else if (opts.window_type ==
  35. "povey") { // like hamming but goes to zero at edges.
  36. window_data[i] = pow(0.5 - 0.5 * cos(a * i_fl), 0.85);
  37. } else if (opts.window_type == "rectangular") {
  38. window_data[i] = 1.0;
  39. } else if (opts.window_type == "blackman") {
  40. window_data[i] = opts.blackman_coeff - 0.5 * cos(a * i_fl) +
  41. (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
  42. } else {
  43. KNF_LOG(FATAL) << "Invalid window type " << opts.window_type;
  44. }
  45. }
  46. }
  47. void FeatureWindowFunction::Apply(float *wave) const {
  48. int32_t window_size = window_.size();
  49. const float *p = window_.data();
  50. for (int32_t k = 0; k != window_size; ++k) {
  51. wave[k] *= p[k];
  52. }
  53. }
  54. int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) {
  55. int64_t frame_shift = opts.WindowShift();
  56. if (opts.snip_edges) {
  57. return frame * frame_shift;
  58. } else {
  59. int64_t midpoint_of_frame = frame_shift * frame + frame_shift / 2,
  60. beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
  61. return beginning_of_frame;
  62. }
  63. }
  64. int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
  65. bool flush /*= true*/) {
  66. int64_t frame_shift = opts.WindowShift();
  67. int64_t frame_length = opts.WindowSize();
  68. if (opts.snip_edges) {
  69. // with --snip-edges=true (the default), we use a HTK-like approach to
  70. // determining the number of frames-- all frames have to fit completely into
  71. // the waveform, and the first frame begins at sample zero.
  72. if (num_samples < frame_length)
  73. return 0;
  74. else
  75. return (1 + ((num_samples - frame_length) / frame_shift));
  76. // You can understand the expression above as follows: 'num_samples -
  77. // frame_length' is how much room we have to shift the frame within the
  78. // waveform; 'frame_shift' is how much we shift it each time; and the ratio
  79. // is how many times we can shift it (integer arithmetic rounds down).
  80. } else {
  81. // if --snip-edges=false, the number of frames is determined by rounding the
  82. // (file-length / frame-shift) to the nearest integer. The point of this
  83. // formula is to make the number of frames an obvious and predictable
  84. // function of the frame shift and signal length, which makes many
  85. // segmentation-related questions simpler.
  86. //
  87. // Because integer division in C++ rounds toward zero, we add (half the
  88. // frame-shift minus epsilon) before dividing, to have the effect of
  89. // rounding towards the closest integer.
  90. int32_t num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
  91. if (flush) return num_frames;
  92. // note: 'end' always means the last plus one, i.e. one past the last.
  93. int64_t end_sample_of_last_frame =
  94. FirstSampleOfFrame(num_frames - 1, opts) + frame_length;
  95. // the following code is optimized more for clarity than efficiency.
  96. // If flush == false, we can't output frames that extend past the end
  97. // of the signal.
  98. while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
  99. num_frames--;
  100. end_sample_of_last_frame -= frame_shift;
  101. }
  102. return num_frames;
  103. }
  104. }
  105. void ExtractWindow(int64_t sample_offset, const float *wave, std::size_t wave_size,
  106. int32_t f, const FrameExtractionOptions &opts,
  107. const FeatureWindowFunction &window_function,
  108. std::vector<float> *window,
  109. float *log_energy_pre_window /*= nullptr*/) {
  110. KNF_CHECK(sample_offset >= 0 && wave_size != 0);
  111. int32_t frame_length = opts.WindowSize();
  112. int32_t frame_length_padded = opts.PaddedWindowSize();
  113. int64_t num_samples = sample_offset + wave_size;
  114. int64_t start_sample = FirstSampleOfFrame(f, opts);
  115. int64_t end_sample = start_sample + frame_length;
  116. if (opts.snip_edges) {
  117. KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples);
  118. } else {
  119. KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset);
  120. }
  121. if (window->size() != frame_length_padded) {
  122. window->resize(frame_length_padded);
  123. }
  124. // wave_start and wave_end are start and end indexes into 'wave', for the
  125. // piece of wave that we're trying to extract.
  126. int32_t wave_start = int32_t(start_sample - sample_offset);
  127. int32_t wave_end = wave_start + frame_length;
  128. if (wave_start >= 0 && wave_end <= wave_size) {
  129. // the normal case-- no edge effects to consider.
  130. std::copy(wave + wave_start,
  131. wave + wave_start + frame_length, window->data());
  132. } else {
  133. // Deal with any end effects by reflection, if needed. This code will only
  134. // be reached for about two frames per utterance, so we don't concern
  135. // ourselves excessively with efficiency.
  136. int32_t wave_dim = wave_size;
  137. for (int32_t s = 0; s < frame_length; ++s) {
  138. int32_t s_in_wave = s + wave_start;
  139. while (s_in_wave < 0 || s_in_wave >= wave_dim) {
  140. // reflect around the beginning or end of the wave.
  141. // e.g. -1 -> 0, -2 -> 1.
  142. // dim -> dim - 1, dim + 1 -> dim - 2.
  143. // the code supports repeated reflections, although this
  144. // would only be needed in pathological cases.
  145. if (s_in_wave < 0)
  146. s_in_wave = -s_in_wave - 1;
  147. else
  148. s_in_wave = 2 * wave_dim - 1 - s_in_wave;
  149. }
  150. (*window)[s] = wave[s_in_wave];
  151. }
  152. }
  153. ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
  154. }
  // Subtracts the arithmetic mean of d[0], ..., d[n-1] from each of those
  // elements, in place. The sum is accumulated in single precision.
  static void RemoveDcOffset(float *d, int32_t n) {
    float *const last = d + n;

    float total = 0;
    for (const float *p = d; p != last; ++p) {
      total += *p;
    }

    const float mean = total / n;
    for (float *p = d; p != last; ++p) {
      *p -= mean;
    }
  }
  // Returns the sum over i in [0, n) of a[i] * b[i], accumulated in single
  // precision in index order. Returns 0 when n is 0.
  float InnerProduct(const float *a, const float *b, int32_t n) {
    float acc = 0;
    for (const float *const a_end = a + n; a != a_end; ++a, ++b) {
      acc += *a * *b;
    }
    return acc;
  }
  172. static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
  173. if (preemph_coeff == 0.0) {
  174. return;
  175. }
  176. KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
  177. for (int32_t i = n - 1; i > 0; --i) {
  178. d[i] -= preemph_coeff * d[i - 1];
  179. }
  180. d[0] -= preemph_coeff * d[0];
  181. }
  182. void ProcessWindow(const FrameExtractionOptions &opts,
  183. const FeatureWindowFunction &window_function, float *window,
  184. float *log_energy_pre_window /*= nullptr*/) {
  185. int32_t frame_length = opts.WindowSize();
  186. if (opts.remove_dc_offset) {
  187. RemoveDcOffset(window, frame_length);
  188. }
  189. if (log_energy_pre_window != NULL) {
  190. float energy = std::max<float>(InnerProduct(window, window, frame_length),
  191. std::numeric_limits<float>::epsilon());
  192. *log_energy_pre_window = std::log(energy);
  193. }
  194. if (opts.preemph_coeff != 0.0) {
  195. Preemphasize(window, frame_length, opts.preemph_coeff);
  196. }
  197. window_function.Apply(window);
  198. }
  199. } // namespace knf