feature-window.h 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. // kaldi-native-fbank/csrc/feature-window.h
  2. //
  3. // Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
  4. // This file is copied/modified from kaldi/src/feat/feature-window.h
  5. #ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_
  6. #define KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_
  7. #include <sstream>
  8. #include <string>
  9. #include <vector>
  10. #include "log.h"
  11. namespace knf {
  12. inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) {
  13. // copied from kaldi/src/base/kaldi-math.cc
  14. KNF_CHECK_GT(n, 0);
  15. n--;
  16. n |= n >> 1;
  17. n |= n >> 2;
  18. n |= n >> 4;
  19. n |= n >> 8;
  20. n |= n >> 16;
  21. return n + 1;
  22. }
  23. struct FrameExtractionOptions {
  24. float samp_freq = 16000;
  25. float frame_shift_ms = 10.0f; // in milliseconds.
  26. float frame_length_ms = 25.0f; // in milliseconds.
  27. float dither = 1.0f; // Amount of dithering, 0.0 means no dither.
  28. float preemph_coeff = 0.97f; // Preemphasis coefficient.
  29. bool remove_dc_offset = true; // Subtract mean of wave before FFT.
  30. std::string window_type = "povey"; // e.g. Hamming window
  31. // May be "hamming", "rectangular", "povey", "hanning", "sine", "blackman"
  32. // "povey" is a window I made to be similar to Hamming but to go to zero at
  33. // the edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) I just don't think the
  34. // Hamming window makes sense as a windowing function.
  35. bool round_to_power_of_two = true;
  36. float blackman_coeff = 0.42f;
  37. bool snip_edges = true;
  38. // bool allow_downsample = false;
  39. // bool allow_upsample = false;
  40. int32_t WindowShift() const {
  41. return static_cast<int32_t>(samp_freq * 0.001f * frame_shift_ms);
  42. }
  43. int32_t WindowSize() const {
  44. return static_cast<int32_t>(samp_freq * 0.001f * frame_length_ms);
  45. }
  46. int32_t PaddedWindowSize() const {
  47. return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize())
  48. : WindowSize());
  49. }
  50. std::string ToString() const {
  51. std::ostringstream os;
  52. #define KNF_PRINT(x) os << #x << ": " << x << "\n"
  53. KNF_PRINT(samp_freq);
  54. KNF_PRINT(frame_shift_ms);
  55. KNF_PRINT(frame_length_ms);
  56. KNF_PRINT(dither);
  57. KNF_PRINT(preemph_coeff);
  58. KNF_PRINT(remove_dc_offset);
  59. KNF_PRINT(window_type);
  60. KNF_PRINT(round_to_power_of_two);
  61. KNF_PRINT(blackman_coeff);
  62. KNF_PRINT(snip_edges);
  63. // KNF_PRINT(allow_downsample);
  64. // KNF_PRINT(allow_upsample);
  65. #undef KNF_PRINT
  66. return os.str();
  67. }
  68. };
  69. std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts);
  70. class FeatureWindowFunction {
  71. public:
  72. FeatureWindowFunction() = default;
  73. explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
  74. /**
  75. * @param wave Pointer to a 1-D array of shape [window_size].
  76. * It is modified in-place: wave[i] = wave[i] * window_[i].
  77. * @param
  78. */
  79. void Apply(float *wave) const;
  80. private:
  81. std::vector<float> window_; // of size opts.WindowSize()
  82. };
  83. int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts);
  84. /**
  85. This function returns the number of frames that we can extract from a wave
  86. file with the given number of samples in it (assumed to have the same
  87. sampling rate as specified in 'opts').
  88. @param [in] num_samples The number of samples in the wave file.
  89. @param [in] opts The frame-extraction options class
  90. @param [in] flush True if we are asserting that this number of samples
  91. is 'all there is', false if we expecting more data to possibly come in. This
  92. only makes a difference to the answer
  93. if opts.snips_edges== false. For offline feature extraction you always want
  94. flush == true. In an online-decoding context, once you know (or decide) that
  95. no more data is coming in, you'd call it with flush == true at the end to
  96. flush out any remaining data.
  97. */
  98. int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
  99. bool flush = true);
  100. /*
  101. ExtractWindow() extracts a windowed frame of waveform (possibly with a
  102. power-of-two, padded size, depending on the config), including all the
  103. processing done by ProcessWindow().
  104. @param [in] sample_offset If 'wave' is not the entire waveform, but
  105. part of it to the left has been discarded, then the
  106. number of samples prior to 'wave' that we have
  107. already discarded. Set this to zero if you are
  108. processing the entire waveform in one piece, or
  109. if you get 'no matching function' compilation
  110. errors when updating the code.
  111. @param [in] wave The waveform
  112. @param [in] f The frame index to be extracted, with
  113. 0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true)
  114. @param [in] opts The options class to be used
  115. @param [in] window_function The windowing function, as derived from the
  116. options class.
  117. @param [out] window The windowed, possibly-padded waveform to be
  118. extracted. Will be resized as needed.
  119. @param [out] log_energy_pre_window If non-NULL, the log-energy of
  120. the signal prior to pre-emphasis and multiplying by
  121. the windowing function will be written to here.
  122. */
  123. void ExtractWindow(int64_t sample_offset, const float *wave, std::size_t wave_size,
  124. int32_t f, const FrameExtractionOptions &opts,
  125. const FeatureWindowFunction &window_function,
  126. std::vector<float> *window,
  127. float *log_energy_pre_window = nullptr);
  128. /**
  129. This function does all the windowing steps after actually
  130. extracting the windowed signal: depending on the
  131. configuration, it does dithering, dc offset removal,
  132. preemphasis, and multiplication by the windowing function.
  133. @param [in] opts The options class to be used
  134. @param [in] window_function The windowing function-- should have
  135. been initialized using 'opts'.
  136. @param [in,out] window A vector of size opts.WindowSize(). Note:
  137. it will typically be a sub-vector of a larger vector of size
  138. opts.PaddedWindowSize(), with the remaining samples zero,
  139. as the FFT code is more efficient if it operates on data with
  140. power-of-two size.
  141. @param [out] log_energy_pre_window If non-NULL, then after dithering and
  142. DC offset removal, this function will write to this pointer the log of
  143. the total energy (i.e. sum-squared) of the frame.
  144. */
  145. void ProcessWindow(const FrameExtractionOptions &opts,
  146. const FeatureWindowFunction &window_function, float *window,
  147. float *log_energy_pre_window = nullptr);
  148. // Compute the inner product of two vectors
  149. float InnerProduct(const float *a, const float *b, int32_t n);
  150. } // namespace knf
  151. #endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_