Explorar el Código

unity.cpp speech_encoder_frontend+speech_encoder+adaptor (#81)

* merge dnn_unity.cpp

* local fixes + kaldi-native-bank

* fbank frontend

* e2e + tests

* revert (maybe) unnecessary changes

* fix merge

* Update test_unity_cpp.py

* Update fixture & align batch dim handling
Ning hace 1 año
padre
commit
1589c529bf
Se han modificado 51 ficheros con 8321 adiciones y 86 borrados
  1. 10 1
      ggml/CMakeLists.txt
  2. 8 0
      ggml/examples/kaldi-native-fbank/CMakeLists.txt
  3. 93 0
      ggml/examples/kaldi-native-fbank/csrc/CMakeLists.txt
  4. 120 0
      ggml/examples/kaldi-native-fbank/csrc/feature-fbank.cc
  5. 134 0
      ggml/examples/kaldi-native-fbank/csrc/feature-fbank.h
  6. 49 0
      ggml/examples/kaldi-native-fbank/csrc/feature-functions.cc
  7. 38 0
      ggml/examples/kaldi-native-fbank/csrc/feature-functions.h
  8. 235 0
      ggml/examples/kaldi-native-fbank/csrc/feature-window.cc
  9. 172 0
      ggml/examples/kaldi-native-fbank/csrc/feature-window.h
  10. 2975 0
      ggml/examples/kaldi-native-fbank/csrc/fftsg.c
  11. 142 0
      ggml/examples/kaldi-native-fbank/csrc/log.cc
  12. 383 0
      ggml/examples/kaldi-native-fbank/csrc/log.h
  13. 257 0
      ggml/examples/kaldi-native-fbank/csrc/mel-computations.cc
  14. 117 0
      ggml/examples/kaldi-native-fbank/csrc/mel-computations.h
  15. 166 0
      ggml/examples/kaldi-native-fbank/csrc/online-feature.cc
  16. 148 0
      ggml/examples/kaldi-native-fbank/csrc/online-feature.h
  17. 67 0
      ggml/examples/kaldi-native-fbank/csrc/rfft.cc
  18. 56 0
      ggml/examples/kaldi-native-fbank/csrc/rfft.h
  19. 73 0
      ggml/examples/kaldi-native-fbank/csrc/test-log.cc
  20. 48 0
      ggml/examples/kaldi-native-fbank/csrc/test-online-fbank.cc
  21. 59 0
      ggml/examples/kaldi-native-fbank/csrc/test-online-feature.cc
  22. 52 0
      ggml/examples/kaldi-native-fbank/csrc/test-rfft.cc
  23. 2 0
      ggml/examples/kaldi-native-fbank/python/CMakeLists.txt
  24. 28 0
      ggml/examples/kaldi-native-fbank/python/csrc/CMakeLists.txt
  25. 57 0
      ggml/examples/kaldi-native-fbank/python/csrc/feature-fbank.cc
  26. 30 0
      ggml/examples/kaldi-native-fbank/python/csrc/feature-fbank.h
  27. 66 0
      ggml/examples/kaldi-native-fbank/python/csrc/feature-window.cc
  28. 30 0
      ggml/examples/kaldi-native-fbank/python/csrc/feature-window.h
  29. 37 0
      ggml/examples/kaldi-native-fbank/python/csrc/kaldi-native-fbank.cc
  30. 27 0
      ggml/examples/kaldi-native-fbank/python/csrc/kaldi-native-fbank.h
  31. 58 0
      ggml/examples/kaldi-native-fbank/python/csrc/mel-computations.cc
  32. 30 0
      ggml/examples/kaldi-native-fbank/python/csrc/mel-computations.h
  33. 68 0
      ggml/examples/kaldi-native-fbank/python/csrc/online-feature.cc
  34. 30 0
      ggml/examples/kaldi-native-fbank/python/csrc/online-feature.h
  35. 134 0
      ggml/examples/kaldi-native-fbank/python/csrc/utils.cc
  36. 52 0
      ggml/examples/kaldi-native-fbank/python/csrc/utils.h
  37. 6 0
      ggml/examples/kaldi-native-fbank/python/kaldi_native_fbank/__init__.py
  38. 31 0
      ggml/examples/kaldi-native-fbank/python/tests/CMakeLists.txt
  39. 198 0
      ggml/examples/kaldi-native-fbank/python/tests/test_fbank_options.py
  40. 119 0
      ggml/examples/kaldi-native-fbank/python/tests/test_frame_extraction_options.py
  41. 107 0
      ggml/examples/kaldi-native-fbank/python/tests/test_mel_bank_options.py
  42. 48 0
      ggml/examples/kaldi-native-fbank/python/tests/test_online_fbank.py
  43. 1 1
      ggml/examples/unity/CMakeLists.txt
  44. 336 3
      ggml/examples/unity/fairseq2.cpp
  45. 51 0
      ggml/examples/unity/fairseq2.h
  46. 50 1
      ggml/ggml_convert.py
  47. 36 4
      ggml/include/ggml/ggml.h
  48. 2 2
      ggml/src/CMakeLists.txt
  49. 1069 71
      ggml/src/ggml.c
  50. 216 2
      ggml/test_unity_cpp.py
  51. 0 1
      ggml/third_party_ggml.py

+ 10 - 1
ggml/CMakeLists.txt

@@ -4,6 +4,7 @@ project(ggml VERSION 0.1.0)
 set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     set(GGML_STANDALONE ON)
@@ -145,12 +146,20 @@ endif()
 # dependencies
 
 set(CMAKE_C_STANDARD   11)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
 
 find_package(Threads REQUIRED)
 
 # main
 
+file(GLOB KALDI_NATIVE_FBANK_SOURCES
+     "${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc/*"
+)
+add_library(kaldi-native-fbank STATIC ${KALDI_NATIVE_FBANK_SOURCES})
+target_include_directories(kaldi-native-fbank PUBLIC
+  ${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc
+)
+
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")

+ 8 - 0
ggml/examples/kaldi-native-fbank/CMakeLists.txt

@@ -0,0 +1,8 @@
+add_subdirectory(csrc)
+
+if(KALDI_NATIVE_FBANK_BUILD_PYTHON)
+  message(STATUS "Building Python")
+  add_subdirectory(python)
+else()
+  message(STATUS "Disable building Python")
+endif()

+ 93 - 0
ggml/examples/kaldi-native-fbank/csrc/CMakeLists.txt

@@ -0,0 +1,93 @@
+
+include_directories(${PROJECT_SOURCE_DIR})
+set(sources
+  feature-fbank.cc
+  feature-functions.cc
+  feature-window.cc
+  fftsg.c
+  mel-computations.cc
+  online-feature.cc
+  rfft.cc
+)
+
+if(KALDI_NATIVE_FBANK_ENABLE_CHECK)
+  list(APPEND sources log.cc)
+endif()
+
+add_library(kaldi-native-fbank-core ${sources})
+if(KALDI_NATIVE_FBANK_ENABLE_CHECK)
+  target_compile_definitions(kaldi-native-fbank-core PUBLIC KNF_ENABLE_CHECK=1)
+
+  if(KNF_HAVE_EXECINFO_H)
+    target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_EXECINFO_H=1)
+  endif()
+
+  if(KNF_HAVE_CXXABI_H)
+    target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_CXXABI_H=1)
+  endif()
+endif()
+
+# We are using std::call_once() in log.h,which requires us to link with -pthread
+if(NOT WIN32 AND KALDI_NATIVE_FBANK_ENABLE_CHECK)
+  target_link_libraries(kaldi-native-fbank-core -pthread)
+endif()
+
+if(KALDI_NATIVE_FBANK_BUILD_TESTS)
+  add_executable(test-online-fbank test-online-fbank.cc)
+  target_link_libraries(test-online-fbank kaldi-native-fbank-core)
+endif()
+
+function(kaldi_native_fbank_add_test source)
+  get_filename_component(name ${source} NAME_WE)
+  add_executable(${name} "${source}")
+  target_link_libraries(${name}
+    PRIVATE
+      kaldi-native-fbank-core
+      gtest
+      gtest_main
+  )
+
+  add_test(NAME "Test.${name}"
+    COMMAND
+    $<TARGET_FILE:${name}>
+  )
+endfunction()
+
+# please sort the source files alphabetically
+set(test_srcs
+  # test-online-feature.cc
+  test-log.cc
+  test-rfft.cc
+)
+
+if(KALDI_NATIVE_FBANK_BUILD_TESTS)
+  foreach(source IN LISTS test_srcs)
+    kaldi_native_fbank_add_test(${source})
+  endforeach()
+endif()
+
+install(TARGETS kaldi-native-fbank-core
+  DESTINATION lib
+)
+
+if(KALDI_NATIVE_FBANK_BUILD_TESTS)
+  install(TARGETS test-online-fbank
+    DESTINATION bin
+  )
+endif()
+
+file(MAKE_DIRECTORY
+  DESTINATION
+    ${PROJECT_BINARY_DIR}/include/kaldi-native-fbank/csrc
+)
+file(GLOB_RECURSE all_headers *.h)
+
+file(COPY
+  ${all_headers}
+  DESTINATION
+    ${PROJECT_BINARY_DIR}/include/kaldi-native-fbank/csrc
+)
+
+install(FILES ${all_headers}
+  DESTINATION include/kaldi-native-fbank/csrc
+)

+ 120 - 0
ggml/examples/kaldi-native-fbank/csrc/feature-fbank.cc

@@ -0,0 +1,120 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-fbank.cc
+//
+#include "feature-fbank.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "feature-functions.h"
+
+namespace knf {
+
+static void Sqrt(float *in_out, int32_t n) {
+  for (int32_t i = 0; i != n; ++i) {
+    in_out[i] = std::sqrt(in_out[i]);
+  }
+}
+
+std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) {
+  os << opts.ToString();
+  return os;
+}
+
+FbankComputer::FbankComputer(const FbankOptions &opts)
+    : opts_(opts), rfft_(opts.frame_opts.PaddedWindowSize()) {
+  if (opts.energy_floor > 0.0f) {
+    log_energy_floor_ = logf(opts.energy_floor);
+  }
+
+  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
+  // [note: this call caches it.]
+  GetMelBanks(1.0f);
+}
+
+FbankComputer::~FbankComputer() {
+  for (auto iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter)
+    delete iter->second;
+}
+
+const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) {
+  MelBanks *this_mel_banks = nullptr;
+
+  // std::map<float, MelBanks *>::iterator iter = mel_banks_.find(vtln_warp);
+  auto iter = mel_banks_.find(vtln_warp);
+  if (iter == mel_banks_.end()) {
+    this_mel_banks = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp);
+    mel_banks_[vtln_warp] = this_mel_banks;
+  } else {
+    this_mel_banks = iter->second;
+  }
+  return this_mel_banks;
+}
+
+void FbankComputer::Compute(float signal_raw_log_energy, float vtln_warp,
+                            std::vector<float> *signal_frame, float *feature) {
+  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
+
+  KNF_CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize());
+
+  // Compute energy after window function (not the raw one).
+  if (opts_.use_energy && !opts_.raw_energy) {
+    signal_raw_log_energy = std::log(
+        std::max<float>(InnerProduct(signal_frame->data(), signal_frame->data(),
+                                     signal_frame->size()),
+                        std::numeric_limits<float>::epsilon()));
+  }
+  rfft_.Compute(signal_frame->data());  // signal_frame is modified in-place
+  ComputePowerSpectrum(signal_frame);
+
+  // Use magnitude instead of power if requested.
+  if (!opts_.use_power) {
+    Sqrt(signal_frame->data(), signal_frame->size() / 2 + 1);
+  }
+
+  int32_t mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
+
+  // Its length is opts_.mel_opts.num_bins
+  float *mel_energies = feature + mel_offset;
+
+  // Sum with mel filter banks over the power spectrum
+  mel_banks.Compute(signal_frame->data(), mel_energies);
+
+  if (opts_.use_log_fbank) {
+    // Avoid log of zero (which should be prevented anyway by dithering).
+    for (int32_t i = 0; i != opts_.mel_opts.num_bins; ++i) {
+      auto t = std::max(mel_energies[i], std::numeric_limits<float>::epsilon());
+      mel_energies[i] = std::log(t);
+    }
+  }
+
+  // Copy energy as first value (or the last, if htk_compat == true).
+  if (opts_.use_energy) {
+    if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
+      signal_raw_log_energy = log_energy_floor_;
+    }
+    int32_t energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
+    feature[energy_index] = signal_raw_log_energy;
+  }
+}
+
+}  // namespace knf

+ 134 - 0
ggml/examples/kaldi-native-fbank/csrc/feature-fbank.h

@@ -0,0 +1,134 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-fbank.h
+
+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "feature-window.h"
+#include "mel-computations.h"
+#include "rfft.h"
+
+namespace knf {
+
+struct FbankOptions {
+  FrameExtractionOptions frame_opts;
+  MelBanksOptions mel_opts;
+  // append an extra dimension with energy to the filter banks
+  bool use_energy = false;
+  float energy_floor = 0.0f;  // active iff use_energy==true
+
+  // If true, compute log_energy before preemphasis and windowing
+  // If false, compute log_energy after preemphasis ans windowing
+  bool raw_energy = true;  // active iff use_energy==true
+
+  // If true, put energy last (if using energy)
+  // If false, put energy first
+  bool htk_compat = false;  // active iff use_energy==true
+
+  // if true (default), produce log-filterbank, else linear
+  bool use_log_fbank = true;
+
+  // if true (default), use power in filterbank
+  // analysis, else magnitude.
+  bool use_power = true;
+
+  FbankOptions() { mel_opts.num_bins = 23; }
+
+  std::string ToString() const {
+    std::ostringstream os;
+    os << "frame_opts: \n";
+    os << frame_opts << "\n";
+    os << "\n";
+
+    os << "mel_opts: \n";
+    os << mel_opts << "\n";
+
+    os << "use_energy: " << use_energy << "\n";
+    os << "energy_floor: " << energy_floor << "\n";
+    os << "raw_energy: " << raw_energy << "\n";
+    os << "htk_compat: " << htk_compat << "\n";
+    os << "use_log_fbank: " << use_log_fbank << "\n";
+    os << "use_power: " << use_power << "\n";
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const FbankOptions &opts);
+
+class FbankComputer {
+ public:
+  using Options = FbankOptions;
+
+  explicit FbankComputer(const FbankOptions &opts);
+  ~FbankComputer();
+
+  int32_t Dim() const {
+    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
+  }
+
+  // if true, compute log_energy_pre_window but after dithering and dc removal
+  bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
+
+  const FrameExtractionOptions &GetFrameOptions() const {
+    return opts_.frame_opts;
+  }
+
+  const FbankOptions &GetOptions() const { return opts_; }
+
+  /**
+     Function that computes one frame of features from
+     one frame of signal.
+
+     @param [in] signal_raw_log_energy The log-energy of the frame of the signal
+         prior to windowing and pre-emphasis, or
+         log(numeric_limits<float>::min()), whichever is greater.  Must be
+         ignored by this function if this class returns false from
+         this->NeedsRawLogEnergy().
+     @param [in] vtln_warp  The VTLN warping factor that the user wants
+         to be applied when computing features for this utterance.  Will
+         normally be 1.0, meaning no warping is to be done.  The value will
+         be ignored for feature types that don't support VLTN, such as
+         spectrogram features.
+     @param [in] signal_frame  One frame of the signal,
+       as extracted using the function ExtractWindow() using the options
+       returned by this->GetFrameOptions().  The function will use the
+       vector as a workspace, which is why it's a non-const pointer.
+     @param [out] feature  Pointer to a vector of size this->Dim(), to which
+         the computed feature will be written. It should be pre-allocated.
+  */
+  void Compute(float signal_raw_log_energy, float vtln_warp,
+               std::vector<float> *signal_frame, float *feature);
+
+ private:
+  const MelBanks *GetMelBanks(float vtln_warp);
+
+  FbankOptions opts_;
+  float log_energy_floor_;
+  std::map<float, MelBanks *> mel_banks_;  // float is VTLN coefficient.
+  Rfft rfft_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_

+ 49 - 0
ggml/examples/kaldi-native-fbank/csrc/feature-functions.cc

@@ -0,0 +1,49 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-functions.cc
+
+#include "feature-functions.h"
+
+#include <cstdint>
+#include <vector>
+
+namespace knf {
+
+void ComputePowerSpectrum(std::vector<float> *complex_fft) {
+  int32_t dim = complex_fft->size();
+
+  // now we have in complex_fft, first half of complex spectrum
+  // it's stored as [real0, realN/2, real1, im1, real2, im2, ...]
+
+  float *p = complex_fft->data();
+  int32_t half_dim = dim / 2;
+  float first_energy = p[0] * p[0];
+  float last_energy = p[1] * p[1];  // handle this special case
+
+  for (int32_t i = 1; i < half_dim; ++i) {
+    float real = p[i * 2];
+    float im = p[i * 2 + 1];
+    p[i] = real * real + im * im;
+  }
+  p[0] = first_energy;
+  p[half_dim] = last_energy;  // Will actually never be used, and anyway
+  // if the signal has been bandlimited sensibly this should be zero.
+}
+
+}  // namespace knf

+ 38 - 0
ggml/examples/kaldi-native-fbank/csrc/feature-functions.h

@@ -0,0 +1,38 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-functions.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_
+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_
+
+#include <vector>
+namespace knf {
+
+// ComputePowerSpectrum converts a complex FFT (as produced by the FFT
+// functions in csrc/rfft.h), and converts it into
+// a power spectrum.  If the complex FFT is a vector of size n (representing
+// half of the complex FFT of a real signal of size n, as described there),
+// this function computes in the first (n/2) + 1 elements of it, the
+// energies of the fft bins from zero to the Nyquist frequency.  Contents of the
+// remaining (n/2) - 1 elements are undefined at output.
+
+void ComputePowerSpectrum(std::vector<float> *complex_fft);
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_

+ 235 - 0
ggml/examples/kaldi-native-fbank/csrc/feature-window.cc

@@ -0,0 +1,235 @@
+// kaldi-native-fbank/csrc/feature-window.cc
+//
+// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+
+// This file is copied/modified from kaldi/src/feat/feature-window.cc
+
+#include "feature-window.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#ifndef M_2PI
+#define M_2PI 6.283185307179586476925286766559005
+#endif
+
+namespace knf {
+
+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
+  os << opts.ToString();
+  return os;
+}
+
+FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
+    : window_(opts.WindowSize()) {
+  int32_t frame_length = opts.WindowSize();
+  KNF_CHECK_GT(frame_length, 0);
+
+  float *window_data = window_.data();
+
+  double a = M_2PI / (frame_length - 1);
+  for (int32_t i = 0; i < frame_length; i++) {
+    double i_fl = static_cast<double>(i);
+    if (opts.window_type == "hanning") {
+      window_data[i] = 0.5 - 0.5 * cos(a * i_fl);
+    } else if (opts.window_type == "sine") {
+      // when you are checking ws wikipedia, please
+      // note that 0.5 * a = M_PI/(frame_length-1)
+      window_data[i] = sin(0.5 * a * i_fl);
+    } else if (opts.window_type == "hamming") {
+      window_data[i] = 0.54 - 0.46 * cos(a * i_fl);
+    } else if (opts.window_type ==
+               "povey") {  // like hamming but goes to zero at edges.
+      window_data[i] = pow(0.5 - 0.5 * cos(a * i_fl), 0.85);
+    } else if (opts.window_type == "rectangular") {
+      window_data[i] = 1.0;
+    } else if (opts.window_type == "blackman") {
+      window_data[i] = opts.blackman_coeff - 0.5 * cos(a * i_fl) +
+                       (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
+    } else {
+      KNF_LOG(FATAL) << "Invalid window type " << opts.window_type;
+    }
+  }
+}
+
+void FeatureWindowFunction::Apply(float *wave) const {
+  int32_t window_size = window_.size();
+  const float *p = window_.data();
+  for (int32_t k = 0; k != window_size; ++k) {
+    wave[k] *= p[k];
+  }
+}
+
+int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) {
+  int64_t frame_shift = opts.WindowShift();
+  if (opts.snip_edges) {
+    return frame * frame_shift;
+  } else {
+    int64_t midpoint_of_frame = frame_shift * frame + frame_shift / 2,
+            beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
+    return beginning_of_frame;
+  }
+}
+
+int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
+                  bool flush /*= true*/) {
+  int64_t frame_shift = opts.WindowShift();
+  int64_t frame_length = opts.WindowSize();
+  if (opts.snip_edges) {
+    // with --snip-edges=true (the default), we use a HTK-like approach to
+    // determining the number of frames-- all frames have to fit completely into
+    // the waveform, and the first frame begins at sample zero.
+    if (num_samples < frame_length)
+      return 0;
+    else
+      return (1 + ((num_samples - frame_length) / frame_shift));
+    // You can understand the expression above as follows: 'num_samples -
+    // frame_length' is how much room we have to shift the frame within the
+    // waveform; 'frame_shift' is how much we shift it each time; and the ratio
+    // is how many times we can shift it (integer arithmetic rounds down).
+  } else {
+    // if --snip-edges=false, the number of frames is determined by rounding the
+    // (file-length / frame-shift) to the nearest integer.  The point of this
+    // formula is to make the number of frames an obvious and predictable
+    // function of the frame shift and signal length, which makes many
+    // segmentation-related questions simpler.
+    //
+    // Because integer division in C++ rounds toward zero, we add (half the
+    // frame-shift minus epsilon) before dividing, to have the effect of
+    // rounding towards the closest integer.
+    int32_t num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
+
+    if (flush) return num_frames;
+
+    // note: 'end' always means the last plus one, i.e. one past the last.
+    int64_t end_sample_of_last_frame =
+        FirstSampleOfFrame(num_frames - 1, opts) + frame_length;
+
+    // the following code is optimized more for clarity than efficiency.
+    // If flush == false, we can't output frames that extend past the end
+    // of the signal.
+    while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
+      num_frames--;
+      end_sample_of_last_frame -= frame_shift;
+    }
+    return num_frames;
+  }
+}
+
+void ExtractWindow(int64_t sample_offset, const float *wave, std::size_t wave_size,
+                   int32_t f, const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function,
+                   std::vector<float> *window,
+                   float *log_energy_pre_window /*= nullptr*/) {
+  KNF_CHECK(sample_offset >= 0 && wave_size != 0);
+
+  int32_t frame_length = opts.WindowSize();
+  int32_t frame_length_padded = opts.PaddedWindowSize();
+
+  int64_t num_samples = sample_offset + wave_size;
+  int64_t start_sample = FirstSampleOfFrame(f, opts);
+  int64_t end_sample = start_sample + frame_length;
+
+  if (opts.snip_edges) {
+    KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples);
+  } else {
+    KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset);
+  }
+
+  if (window->size() != frame_length_padded) {
+    window->resize(frame_length_padded);
+  }
+
+  // wave_start and wave_end are start and end indexes into 'wave', for the
+  // piece of wave that we're trying to extract.
+  int32_t wave_start = int32_t(start_sample - sample_offset);
+  int32_t wave_end = wave_start + frame_length;
+
+  if (wave_start >= 0 && wave_end <= wave_size) {
+    // the normal case-- no edge effects to consider.
+    std::copy(wave + wave_start,
+              wave + wave_start + frame_length, window->data());
+  } else {
+    // Deal with any end effects by reflection, if needed.  This code will only
+    // be reached for about two frames per utterance, so we don't concern
+    // ourselves excessively with efficiency.
+    int32_t wave_dim = wave_size;
+    for (int32_t s = 0; s < frame_length; ++s) {
+      int32_t s_in_wave = s + wave_start;
+      while (s_in_wave < 0 || s_in_wave >= wave_dim) {
+        // reflect around the beginning or end of the wave.
+        // e.g. -1 -> 0, -2 -> 1.
+        // dim -> dim - 1, dim + 1 -> dim - 2.
+        // the code supports repeated reflections, although this
+        // would only be needed in pathological cases.
+        if (s_in_wave < 0)
+          s_in_wave = -s_in_wave - 1;
+        else
+          s_in_wave = 2 * wave_dim - 1 - s_in_wave;
+      }
+      (*window)[s] = wave[s_in_wave];
+    }
+  }
+
+  ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
+}
+
+static void RemoveDcOffset(float *d, int32_t n) {
+  float sum = 0;
+  for (int32_t i = 0; i != n; ++i) {
+    sum += d[i];
+  }
+
+  float mean = sum / n;
+
+  for (int32_t i = 0; i != n; ++i) {
+    d[i] -= mean;
+  }
+}
+
+float InnerProduct(const float *a, const float *b, int32_t n) {
+  float sum = 0;
+  for (int32_t i = 0; i != n; ++i) {
+    sum += a[i] * b[i];
+  }
+  return sum;
+}
+
+static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
+  if (preemph_coeff == 0.0) {
+    return;
+  }
+
+  KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
+
+  for (int32_t i = n - 1; i > 0; --i) {
+    d[i] -= preemph_coeff * d[i - 1];
+  }
+  d[0] -= preemph_coeff * d[0];
+}
+
+void ProcessWindow(const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function, float *window,
+                   float *log_energy_pre_window /*= nullptr*/) {
+  int32_t frame_length = opts.WindowSize();
+
+  if (opts.remove_dc_offset) {
+    RemoveDcOffset(window, frame_length);
+  }
+
+  if (log_energy_pre_window != NULL) {
+    float energy = std::max<float>(InnerProduct(window, window, frame_length),
+                                   std::numeric_limits<float>::epsilon());
+    *log_energy_pre_window = std::log(energy);
+  }
+
+  if (opts.preemph_coeff != 0.0) {
+    Preemphasize(window, frame_length, opts.preemph_coeff);
+  }
+
+  window_function.Apply(window);
+}
+
+}  // namespace knf

+ 172 - 0
ggml/examples/kaldi-native-fbank/csrc/feature-window.h

@@ -0,0 +1,172 @@
+// kaldi-native-fbank/csrc/feature-window.h
+//
+// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+
+// This file is copied/modified from kaldi/src/feat/feature-window.h
+
+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_
+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "log.h"
+
+namespace knf {
+
+inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) {
+  // copied from kaldi/src/base/kaldi-math.cc
+  KNF_CHECK_GT(n, 0);
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  return n + 1;
+}
+
+struct FrameExtractionOptions {
+  float samp_freq = 16000;
+  float frame_shift_ms = 10.0f;   // in milliseconds.
+  float frame_length_ms = 25.0f;  // in milliseconds.
+  float dither = 1.0f;            // Amount of dithering, 0.0 means no dither.
+  float preemph_coeff = 0.97f;    // Preemphasis coefficient.
+  bool remove_dc_offset = true;   // Subtract mean of wave before FFT.
+  std::string window_type = "povey";  // e.g. Hamming window
+  // May be "hamming", "rectangular", "povey", "hanning", "sine", "blackman"
+  // "povey" is a window I made to be similar to Hamming but to go to zero at
+  // the edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) I just don't think the
+  // Hamming window makes sense as a windowing function.
+  bool round_to_power_of_two = true;
+  float blackman_coeff = 0.42f;
+  bool snip_edges = true;
+  // bool allow_downsample = false;
+  // bool allow_upsample = false;
+
+  int32_t WindowShift() const {
+    return static_cast<int32_t>(samp_freq * 0.001f * frame_shift_ms);
+  }
+  int32_t WindowSize() const {
+    return static_cast<int32_t>(samp_freq * 0.001f * frame_length_ms);
+  }
+  int32_t PaddedWindowSize() const {
+    return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize())
+                                  : WindowSize());
+  }
+  std::string ToString() const {
+    std::ostringstream os;
+#define KNF_PRINT(x) os << #x << ": " << x << "\n"
+    KNF_PRINT(samp_freq);
+    KNF_PRINT(frame_shift_ms);
+    KNF_PRINT(frame_length_ms);
+    KNF_PRINT(dither);
+    KNF_PRINT(preemph_coeff);
+    KNF_PRINT(remove_dc_offset);
+    KNF_PRINT(window_type);
+    KNF_PRINT(round_to_power_of_two);
+    KNF_PRINT(blackman_coeff);
+    KNF_PRINT(snip_edges);
+    // KNF_PRINT(allow_downsample);
+    // KNF_PRINT(allow_upsample);
+#undef KNF_PRINT
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts);
+
+class FeatureWindowFunction {
+ public:
+  FeatureWindowFunction() = default;
+  explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
+  /**
+   * @param wave Pointer to a 1-D array of shape [window_size].
+   *             It is modified in-place: wave[i] = wave[i] * window_[i].
+   * @param
+   */
+  void Apply(float *wave) const;
+
+ private:
+  std::vector<float> window_;  // of size opts.WindowSize()
+};
+
+int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts);
+
+/**
+   This function returns the number of frames that we can extract from a wave
+   file with the given number of samples in it (assumed to have the same
+   sampling rate as specified in 'opts').
+
+      @param [in] num_samples  The number of samples in the wave file.
+      @param [in] opts     The frame-extraction options class
+
+      @param [in] flush   True if we are asserting that this number of samples
+   is 'all there is', false if we expecting more data to possibly come in.  This
+   only makes a difference to the answer
+   if opts.snips_edges== false.  For offline feature extraction you always want
+   flush == true.  In an online-decoding context, once you know (or decide) that
+   no more data is coming in, you'd call it with flush == true at the end to
+   flush out any remaining data.
+*/
+int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
+                  bool flush = true);
+
+/*
+  ExtractWindow() extracts a windowed frame of waveform (possibly with a
+  power-of-two, padded size, depending on the config), including all the
+  processing done by ProcessWindow().
+
+  @param [in] sample_offset  If 'wave' is not the entire waveform, but
+                   part of it to the left has been discarded, then the
+                   number of samples prior to 'wave' that we have
+                   already discarded.  Set this to zero if you are
+                   processing the entire waveform in one piece, or
+                   if you get 'no matching function' compilation
+                   errors when updating the code.
+  @param [in] wave  The waveform
+  @param [in] f     The frame index to be extracted, with
+                    0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true)
+  @param [in] opts  The options class to be used
+  @param [in] window_function  The windowing function, as derived from the
+                    options class.
+  @param [out] window  The windowed, possibly-padded waveform to be
+                     extracted.  Will be resized as needed.
+  @param [out] log_energy_pre_window  If non-NULL, the log-energy of
+                   the signal prior to pre-emphasis and multiplying by
+                   the windowing function will be written to here.
+*/
+void ExtractWindow(int64_t sample_offset, const float *wave, std::size_t wave_size,
+                   int32_t f, const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function,
+                   std::vector<float> *window,
+                   float *log_energy_pre_window = nullptr);
+
+/**
+  This function does all the windowing steps after actually
+  extracting the windowed signal: depending on the
+  configuration, it does dithering, dc offset removal,
+  preemphasis, and multiplication by the windowing function.
+   @param [in] opts  The options class to be used
+   @param [in] window_function  The windowing function-- should have
+                    been initialized using 'opts'.
+   @param [in,out] window  A vector of size opts.WindowSize().  Note:
+      it will typically be a sub-vector of a larger vector of size
+      opts.PaddedWindowSize(), with the remaining samples zero,
+      as the FFT code is more efficient if it operates on data with
+      power-of-two size.
+   @param [out]   log_energy_pre_window If non-NULL, then after dithering and
+      DC offset removal, this function will write to this pointer the log of
+      the total energy (i.e. sum-squared) of the frame.
+ */
+void ProcessWindow(const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function, float *window,
+                   float *log_energy_pre_window = nullptr);
+
+// Compute the inner product of two vectors
+float InnerProduct(const float *a, const float *b, int32_t n);
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_

+ 2975 - 0
ggml/examples/kaldi-native-fbank/csrc/fftsg.c

@@ -0,0 +1,2975 @@
+/* This file is copied from
+ *
+ * https://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+ *
+ * Copyright Takuya OOURA, 1996-2001
+ *
+ * You may use, copy, modify and distribute this code for any
+ * purpose (include commercial use) and without fee. Please refer to
+ * this package when you modify this code.
+ */
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :split-radix
+    data        :inplace
+    table       :use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *, int *, double *);
+    void rdft(int, int, double *, int *, double *);
+    void ddct(int, int, double *, int *, double *);
+    void ddst(int, int, double *, int *, double *);
+    void dfct(int, double *, double *, int *, double *);
+    void dfst(int, double *, double *, int *, double *);
+macro definitions
+    USE_CDFT_PTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=8192
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536
+    USE_CDFT_WINTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=32768
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft(2*n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft(2*n, -1, a, ip, w);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]),
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]),
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            cdft(2*n, -1, a, ip, w);
+        is
+            cdft(2*n, 1, a, ip, w);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            rdft(n, 1, a, ip, w);
+        is
+            rdft(n, -1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddct(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddct(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddst(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddst(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        ip[0] = 0; // first time only
+        dfct(n, a, t, ip, w);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+        t[0...n/2]     :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+        is
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        ip[0] = 0; // first time only
+        dfst(n, a, t, ip, w);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+        t[0...n/2-1]   :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            dfst(n, a, t, ip, w);
+        is
+            dfst(n, a, t, ip, w);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+Appendix :
+    The cos/sin table is recalculated when the larger table required.
+    w[] and ip[] are compatible with all routines.
+*/
+
+
+
+void rdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    int nw, nc;
+    double xi;
+
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 2)) {
+        nc = n >> 2;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a, ip, nw, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, ip, nw, w);
+        }
+        xi = a[0] - a[1];
+        a[0] += a[1];
+        a[1] = xi;
+    } else {
+        a[1] = 0.5 * (a[0] - a[1]);
+        a[0] -= a[1];
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            cftbsub(n, a, ip, nw, w);
+        } else if (n == 4) {
+            cftbsub(n, a, ip, nw, w);
+        }
+    }
+}
+
+
+/* -------- initializing routines -------- */
+
+
+#include <math.h>
+
+void makewt(int nw, int *ip, double *w)
+{
+    void makeipt(int nw, int *ip);
+    int j, nwh, nw0, nw1;
+    double delta, wn4r, wk1r, wk1i, wk3r, wk3i;
+
+    ip[0] = nw;
+    ip[1] = 1;
+    if (nw > 2) {
+        nwh = nw >> 1;
+        delta = atan(1.0) / nwh;
+        wn4r = cos(delta * nwh);
+        w[0] = 1;
+        w[1] = wn4r;
+        if (nwh == 4) {
+            w[2] = cos(delta * 2);
+            w[3] = sin(delta * 2);
+        } else if (nwh > 4) {
+            makeipt(nw, ip);
+            w[2] = 0.5 / cos(delta * 2);
+            w[3] = 0.5 / cos(delta * 6);
+            for (j = 4; j < nwh; j += 4) {
+                w[j] = cos(delta * j);
+                w[j + 1] = sin(delta * j);
+                w[j + 2] = cos(3 * delta * j);
+                w[j + 3] = -sin(3 * delta * j);
+            }
+        }
+        nw0 = 0;
+        while (nwh > 2) {
+            nw1 = nw0 + nwh;
+            nwh >>= 1;
+            w[nw1] = 1;
+            w[nw1 + 1] = wn4r;
+            if (nwh == 4) {
+                wk1r = w[nw0 + 4];
+                wk1i = w[nw0 + 5];
+                w[nw1 + 2] = wk1r;
+                w[nw1 + 3] = wk1i;
+            } else if (nwh > 4) {
+                wk1r = w[nw0 + 4];
+                wk3r = w[nw0 + 6];
+                w[nw1 + 2] = 0.5 / wk1r;
+                w[nw1 + 3] = 0.5 / wk3r;
+                for (j = 4; j < nwh; j += 4) {
+                    wk1r = w[nw0 + 2 * j];
+                    wk1i = w[nw0 + 2 * j + 1];
+                    wk3r = w[nw0 + 2 * j + 2];
+                    wk3i = w[nw0 + 2 * j + 3];
+                    w[nw1 + j] = wk1r;
+                    w[nw1 + j + 1] = wk1i;
+                    w[nw1 + j + 2] = wk3r;
+                    w[nw1 + j + 3] = wk3i;
+                }
+            }
+            nw0 = nw1;
+        }
+    }
+}
+
+
+void makeipt(int nw, int *ip)
+{
+    int j, l, m, m2, p, q;
+
+    ip[2] = 0;
+    ip[3] = 16;
+    m = 2;
+    for (l = nw; l > 32; l >>= 2) {
+        m2 = m << 1;
+        q = m2 << 3;
+        for (j = m; j < m2; j++) {
+            p = ip[j] << 2;
+            ip[m + j] = p;
+            ip[m2 + j] = p + q;
+        }
+        m = m2;
+    }
+}
+
+
+void makect(int nc, int *ip, double *c)
+{
+    int j, nch;
+    double delta;
+
+    ip[1] = nc;
+    if (nc > 1) {
+        nch = nc >> 1;
+        delta = atan(1.0) / nch;
+        c[0] = cos(delta * nch);
+        c[nch] = 0.5 * c[0];
+        for (j = 1; j < nch; j++) {
+            c[j] = 0.5 * cos(delta * j);
+            c[nc - j] = 0.5 * sin(delta * j);
+        }
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+#ifdef USE_CDFT_PTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 8192
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t pthread_t
+#define cdft_thread_create(thp,func,argp) { \
+    if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { \
+    if (pthread_join(th, NULL) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#endif /* USE_CDFT_PTHREADS */
+
+
+#ifdef USE_CDFT_WINTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 32768
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 524288
+#endif
+#include <windows.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t HANDLE
+#define cdft_thread_create(thp,func,argp) { \
+    DWORD thid; \
+    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \
+    if (*(thp) == 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { \
+    WaitForSingleObject(th, INFINITE); \
+    CloseHandle(th); \
+}
+#endif /* USE_CDFT_WINTHREADS */
+
+
+void cftfsub(int n, double *a, int *ip, int nw, double *w)
+{
+    void bitrv2(int n, int *ip, double *a);
+    void bitrv216(double *a);
+    void bitrv208(double *a);
+    void cftf1st(int n, double *a, double *w);
+    void cftrec4(int n, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftfx41(int n, double *a, int nw, double *w);
+    void cftf161(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+
+    if (n > 8) {
+        if (n > 32) {
+            cftf1st(n, a, &w[nw - (n >> 2)]);
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a, nw, w);
+            } else
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) {
+                cftrec4(n, a, nw, w);
+            } else if (n > 128) {
+                cftleaf(n, 1, a, nw, w);
+            } else {
+                cftfx41(n, a, nw, w);
+            }
+            bitrv2(n, ip, a);
+        } else if (n == 32) {
+            cftf161(a, &w[nw - 8]);
+            bitrv216(a);
+        } else {
+            cftf081(a, w);
+            bitrv208(a);
+        }
+    } else if (n == 8) {
+        cftf040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+
+
+void cftbsub(int n, double *a, int *ip, int nw, double *w)
+{
+    void bitrv2conj(int n, int *ip, double *a);
+    void bitrv216neg(double *a);
+    void bitrv208neg(double *a);
+    void cftb1st(int n, double *a, double *w);
+    void cftrec4(int n, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftfx41(int n, double *a, int nw, double *w);
+    void cftf161(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftb040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+
+    if (n > 8) {
+        if (n > 32) {
+            cftb1st(n, a, &w[nw - (n >> 2)]);
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a, nw, w);
+            } else
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) {
+                cftrec4(n, a, nw, w);
+            } else if (n > 128) {
+                cftleaf(n, 1, a, nw, w);
+            } else {
+                cftfx41(n, a, nw, w);
+            }
+            bitrv2conj(n, ip, a);
+        } else if (n == 32) {
+            cftf161(a, &w[nw - 8]);
+            bitrv216neg(a);
+        } else {
+            cftf081(a, w);
+            bitrv208neg(a);
+        }
+    } else if (n == 8) {
+        cftb040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+
+
+void bitrv2(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, nh, nm;
+    double xr, xi, yr, yi;
+
+    m = 1;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    nm = 4 * m;
+    if (l == 8) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + 2 * ip[m + k];
+                k1 = 4 * k + 2 * ip[m + j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + 2 * ip[m + k];
+            j1 = k1 + 2;
+            k1 += nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 += 2 * nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 -= nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= 2;
+            k1 -= nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nh + 2;
+            k1 += nh + 2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= nh - nm;
+            k1 += 2 * nm - 2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    } else {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + ip[m + k];
+                k1 = 4 * k + ip[m + j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + ip[m + k];
+            j1 = k1 + 2;
+            k1 += nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 += nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    }
+}
+
+
+void bitrv2conj(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, nh, nm;
+    double xr, xi, yr, yi;
+
+    m = 1;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    nm = 4 * m;
+    if (l == 8) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + 2 * ip[m + k];
+                k1 = 4 * k + 2 * ip[m + j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + 2 * ip[m + k];
+            j1 = k1 + 2;
+            k1 += nh;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+            j1 += nm;
+            k1 += 2 * nm;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 -= nm;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= 2;
+            k1 -= nh;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nh + 2;
+            k1 += nh + 2;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= nh - nm;
+            k1 += 2 * nm - 2;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+        }
+    } else {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + ip[m + k];
+                k1 = 4 * k + ip[m + j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + ip[m + k];
+            j1 = k1 + 2;
+            k1 += nh;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+            j1 += nm;
+            k1 += nm;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+        }
+    }
+}
+
+
+void bitrv216(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,
+        x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i,
+        x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i;
+
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    a[2] = x8r;
+    a[3] = x8i;
+    a[4] = x4r;
+    a[5] = x4i;
+    a[6] = x12r;
+    a[7] = x12i;
+    a[8] = x2r;
+    a[9] = x2i;
+    a[10] = x10r;
+    a[11] = x10i;
+    a[14] = x14r;
+    a[15] = x14i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[20] = x5r;
+    a[21] = x5i;
+    a[22] = x13r;
+    a[23] = x13i;
+    a[24] = x3r;
+    a[25] = x3i;
+    a[26] = x11r;
+    a[27] = x11i;
+    a[28] = x7r;
+    a[29] = x7i;
+}
+
+
+void bitrv216neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,
+        x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i,
+        x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i,
+        x13r, x13i, x14r, x14i, x15r, x15i;
+
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x9r = a[18];
+    x9i = a[19];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    x15r = a[30];
+    x15i = a[31];
+    a[2] = x15r;
+    a[3] = x15i;
+    a[4] = x7r;
+    a[5] = x7i;
+    a[6] = x11r;
+    a[7] = x11i;
+    a[8] = x3r;
+    a[9] = x3i;
+    a[10] = x13r;
+    a[11] = x13i;
+    a[12] = x5r;
+    a[13] = x5i;
+    a[14] = x9r;
+    a[15] = x9i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[18] = x14r;
+    a[19] = x14i;
+    a[20] = x6r;
+    a[21] = x6i;
+    a[22] = x10r;
+    a[23] = x10i;
+    a[24] = x2r;
+    a[25] = x2i;
+    a[26] = x12r;
+    a[27] = x12i;
+    a[28] = x4r;
+    a[29] = x4i;
+    a[30] = x8r;
+    a[31] = x8i;
+}
+
+
+void bitrv208(double *a)
+{
+    double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i;
+
+    x1r = a[2];
+    x1i = a[3];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x6r = a[12];
+    x6i = a[13];
+    a[2] = x4r;
+    a[3] = x4i;
+    a[6] = x6r;
+    a[7] = x6i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[12] = x3r;
+    a[13] = x3i;
+}
+
+
+void bitrv208neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,
+        x5r, x5i, x6r, x6i, x7r, x7i;
+
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    a[2] = x7r;
+    a[3] = x7i;
+    a[4] = x3r;
+    a[5] = x3i;
+    a[6] = x5r;
+    a[7] = x5i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[10] = x6r;
+    a[11] = x6i;
+    a[12] = x2r;
+    a[13] = x2i;
+    a[14] = x4r;
+    a[15] = x4i;
+}
+
+
+void cftf1st(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i,
+        wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = a[1] + a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = a[1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j2] = x1r - x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1];
+    csc1 = w[2];
+    csc3 = w[3];
+    wd1r = 1;
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    k = 0;
+    for (j = 2; j < mh - 2; j += 4) {
+        k += 4;
+        wk1r = csc1 * (wd1r + w[k]);
+        wk1i = csc1 * (wd1i + w[k + 1]);
+        wk3r = csc3 * (wd3r + w[k + 2]);
+        wk3i = csc3 * (wd3i + w[k + 3]);
+        wd1r = w[k];
+        wd1i = w[k + 1];
+        wd3r = w[k + 2];
+        wd3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = a[j + 1] + a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = a[j + 1] - a[j2 + 1];
+        y0r = a[j + 2] + a[j2 + 2];
+        y0i = a[j + 3] + a[j2 + 3];
+        y1r = a[j + 2] - a[j2 + 2];
+        y1i = a[j + 3] - a[j2 + 3];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 + 2] + a[j3 + 2];
+        y2i = a[j1 + 3] + a[j3 + 3];
+        y3r = a[j1 + 2] - a[j3 + 2];
+        y3i = a[j1 + 3] - a[j3 + 3];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j + 2] = y0r + y2r;
+        a[j + 3] = y0i + y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        a[j1 + 2] = y0r - y2r;
+        a[j1 + 3] = y0i - y2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1r * x0r - wk1i * x0i;
+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i + y3r;
+        a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3r * x0r + wk3i * x0i;
+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i - y3r;
+        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+        j0 = m - j;
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] + a[j2];
+        x0i = a[j0 + 1] + a[j2 + 1];
+        x1r = a[j0] - a[j2];
+        x1i = a[j0 + 1] - a[j2 + 1];
+        y0r = a[j0 - 2] + a[j2 - 2];
+        y0i = a[j0 - 1] + a[j2 - 1];
+        y1r = a[j0 - 2] - a[j2 - 2];
+        y1i = a[j0 - 1] - a[j2 - 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 - 2] + a[j3 - 2];
+        y2i = a[j1 - 1] + a[j3 - 1];
+        y3r = a[j1 - 2] - a[j3 - 2];
+        y3i = a[j1 - 1] - a[j3 - 1];
+        a[j0] = x0r + x2r;
+        a[j0 + 1] = x0i + x2i;
+        a[j0 - 2] = y0r + y2r;
+        a[j0 - 1] = y0i + y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        a[j1 - 2] = y0r - y2r;
+        a[j1 - 1] = y0i - y2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1i * x0r - wk1r * x0i;
+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i + y3r;
+        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3i * x0r + wk3r * x0i;
+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i - y3r;
+        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+    }
+    wk1r = csc1 * (wd1r + wn4r);
+    wk1i = csc1 * (wd1i + wn4r);
+    wk3r = csc3 * (wd3r - wn4r);
+    wk3i = csc3 * (wd3i - wn4r);
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0 - 2] + a[j2 - 2];
+    x0i = a[j0 - 1] + a[j2 - 1];
+    x1r = a[j0 - 2] - a[j2 - 2];
+    x1i = a[j0 - 1] - a[j2 - 1];
+    x2r = a[j1 - 2] + a[j3 - 2];
+    x2i = a[j1 - 1] + a[j3 - 1];
+    x3r = a[j1 - 2] - a[j3 - 2];
+    x3i = a[j1 - 1] - a[j3 - 1];
+    a[j0 - 2] = x0r + x2r;
+    a[j0 - 1] = x0i + x2i;
+    a[j1 - 2] = x0r - x2r;
+    a[j1 - 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+    x0r = a[j0] + a[j2];
+    x0i = a[j0 + 1] + a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = a[j0 + 1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i);
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+    x0r = a[j0 + 2] + a[j2 + 2];
+    x0i = a[j0 + 3] + a[j2 + 3];
+    x1r = a[j0 + 2] - a[j2 + 2];
+    x1i = a[j0 + 3] - a[j2 + 3];
+    x2r = a[j1 + 2] + a[j3 + 2];
+    x2i = a[j1 + 3] + a[j3 + 3];
+    x3r = a[j1 + 2] - a[j3 + 2];
+    x3i = a[j1 + 3] - a[j3 + 3];
+    a[j0 + 2] = x0r + x2r;
+    a[j0 + 3] = x0i + x2i;
+    a[j1 + 2] = x0r - x2r;
+    a[j1 + 3] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+void cftb1st(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i,
+        wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = -a[1] - a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = -a[1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    a[j2] = x1r + x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r - x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1];
+    csc1 = w[2];
+    csc3 = w[3];
+    wd1r = 1;
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    k = 0;
+    for (j = 2; j < mh - 2; j += 4) {
+        k += 4;
+        wk1r = csc1 * (wd1r + w[k]);
+        wk1i = csc1 * (wd1i + w[k + 1]);
+        wk3r = csc3 * (wd3r + w[k + 2]);
+        wk3i = csc3 * (wd3i + w[k + 3]);
+        wd1r = w[k];
+        wd1i = w[k + 1];
+        wd3r = w[k + 2];
+        wd3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = -a[j + 1] - a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = -a[j + 1] + a[j2 + 1];
+        y0r = a[j + 2] + a[j2 + 2];
+        y0i = -a[j + 3] - a[j2 + 3];
+        y1r = a[j + 2] - a[j2 + 2];
+        y1i = -a[j + 3] + a[j2 + 3];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 + 2] + a[j3 + 2];
+        y2i = a[j1 + 3] + a[j3 + 3];
+        y3r = a[j1 + 2] - a[j3 + 2];
+        y3i = a[j1 + 3] - a[j3 + 3];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i - x2i;
+        a[j + 2] = y0r + y2r;
+        a[j + 3] = y0i - y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i + x2i;
+        a[j1 + 2] = y0r - y2r;
+        a[j1 + 3] = y0i + y2i;
+        x0r = x1r + x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1r * x0r - wk1i * x0i;
+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i + y3r;
+        a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3r * x0r + wk3i * x0i;
+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i - y3r;
+        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+        j0 = m - j;
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] + a[j2];
+        x0i = -a[j0 + 1] - a[j2 + 1];
+        x1r = a[j0] - a[j2];
+        x1i = -a[j0 + 1] + a[j2 + 1];
+        y0r = a[j0 - 2] + a[j2 - 2];
+        y0i = -a[j0 - 1] - a[j2 - 1];
+        y1r = a[j0 - 2] - a[j2 - 2];
+        y1i = -a[j0 - 1] + a[j2 - 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 - 2] + a[j3 - 2];
+        y2i = a[j1 - 1] + a[j3 - 1];
+        y3r = a[j1 - 2] - a[j3 - 2];
+        y3i = a[j1 - 1] - a[j3 - 1];
+        a[j0] = x0r + x2r;
+        a[j0 + 1] = x0i - x2i;
+        a[j0 - 2] = y0r + y2r;
+        a[j0 - 1] = y0i - y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i + x2i;
+        a[j1 - 2] = y0r - y2r;
+        a[j1 - 1] = y0i + y2i;
+        x0r = x1r + x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1i * x0r - wk1r * x0i;
+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i + y3r;
+        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3i * x0r + wk3r * x0i;
+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i - y3r;
+        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+    }
+    wk1r = csc1 * (wd1r + wn4r);
+    wk1i = csc1 * (wd1i + wn4r);
+    wk3r = csc3 * (wd3r - wn4r);
+    wk3i = csc3 * (wd3i - wn4r);
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0 - 2] + a[j2 - 2];
+    x0i = -a[j0 - 1] - a[j2 - 1];
+    x1r = a[j0 - 2] - a[j2 - 2];
+    x1i = -a[j0 - 1] + a[j2 - 1];
+    x2r = a[j1 - 2] + a[j3 - 2];
+    x2i = a[j1 - 1] + a[j3 - 1];
+    x3r = a[j1 - 2] - a[j3 - 2];
+    x3i = a[j1 - 1] - a[j3 - 1];
+    a[j0 - 2] = x0r + x2r;
+    a[j0 - 1] = x0i - x2i;
+    a[j1 - 2] = x0r - x2r;
+    a[j1 - 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+    x0r = a[j0] + a[j2];
+    x0i = -a[j0 + 1] - a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = -a[j0 + 1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i);
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+    x0r = a[j0 + 2] + a[j2 + 2];
+    x0i = -a[j0 + 3] - a[j2 + 3];
+    x1r = a[j0 + 2] - a[j2 + 2];
+    x1i = -a[j0 + 3] + a[j2 + 3];
+    x2r = a[j1 + 2] + a[j3 + 2];
+    x2i = a[j1 + 3] + a[j3 + 3];
+    x3r = a[j1 + 2] - a[j3 + 2];
+    x3i = a[j1 + 3] - a[j3 + 3];
+    a[j0 + 2] = x0r + x2r;
+    a[j0 + 3] = x0i - x2i;
+    a[j1 + 2] = x0r - x2r;
+    a[j1 + 3] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+#ifdef USE_CDFT_THREADS
+struct cdft_arg_st {
+    int n0;
+    int n;
+    double *a;
+    int nw;
+    double *w;
+};
+typedef struct cdft_arg_st cdft_arg_t;
+
+
+void cftrec4_th(int n, double *a, int nw, double *w)
+{
+    void *cftrec1_th(void *p);
+    void *cftrec2_th(void *p);
+    int i, idiv4, m, nthread;
+    cdft_thread_t th[4];
+    cdft_arg_t ag[4];
+
+    nthread = 2;
+    idiv4 = 0;
+    m = n >> 1;
+    if (n > CDFT_4THREADS_BEGIN_N) {
+        nthread = 4;
+        idiv4 = 1;
+        m >>= 1;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].n0 = n;
+        ag[i].n = m;
+        ag[i].a = &a[i * m];
+        ag[i].nw = nw;
+        ag[i].w = w;
+        if (i != idiv4) {
+            cdft_thread_create(&th[i], cftrec1_th, &ag[i]);
+        } else {
+            cdft_thread_create(&th[i], cftrec2_th, &ag[i]);
+        }
+    }
+    for (i = 0; i < nthread; i++) {
+        cdft_thread_wait(th[i]);
+    }
+}
+
+
+void *cftrec1_th(void *p)
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl1(int n, double *a, double *w);
+    int isplt, j, k, m, n, n0, nw;
+    double *a, *w;
+
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    nw = ((cdft_arg_t *) p)->nw;
+    w = ((cdft_arg_t *) p)->w;
+    m = n0;
+    while (m > 512) {
+        m >>= 2;
+        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+    }
+    cftleaf(m, 1, &a[n - m], nw, w);
+    k = 0;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+    return (void *) 0;
+}
+
+
+void *cftrec2_th(void *p)
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    int isplt, j, k, m, n, n0, nw;
+    double *a, *w;
+
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    nw = ((cdft_arg_t *) p)->nw;
+    w = ((cdft_arg_t *) p)->w;
+    k = 1;
+    m = n0;
+    while (m > 512) {
+        m >>= 2;
+        k <<= 2;
+        cftmdl2(m, &a[n - m], &w[nw - m]);
+    }
+    cftleaf(m, 0, &a[n - m], nw, w);
+    k >>= 1;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+    return (void *) 0;
+}
+#endif /* USE_CDFT_THREADS */
+
+
+void cftrec4(int n, double *a, int nw, double *w)
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl1(int n, double *a, double *w);
+    int isplt, j, k, m;
+
+    m = n;
+    while (m > 512) {
+        m >>= 2;
+        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+    }
+    cftleaf(m, 1, &a[n - m], nw, w);
+    k = 0;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+}
+
+
+int cfttree(int n, int j, int k, double *a, int nw, double *w)
+{
+    void cftmdl1(int n, double *a, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    int i, isplt, m;
+
+    if ((k & 3) != 0) {
+        isplt = k & 1;
+        if (isplt != 0) {
+            cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]);
+        } else {
+            cftmdl2(n, &a[j - n], &w[nw - n]);
+        }
+    } else {
+        m = n;
+        for (i = k; (i & 3) == 0; i >>= 2) {
+            m <<= 2;
+        }
+        isplt = i & 1;
+        if (isplt != 0) {
+            while (m > 128) {
+                cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]);
+                m >>= 2;
+            }
+        } else {
+            while (m > 128) {
+                cftmdl2(m, &a[j - m], &w[nw - m]);
+                m >>= 2;
+            }
+        }
+    }
+    return isplt;
+}
+
+
+void cftleaf(int n, int isplt, double *a, int nw, double *w)
+{
+    void cftmdl1(int n, double *a, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    void cftf161(double *a, double *w);
+    void cftf162(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf082(double *a, double *w);
+
+    if (n == 512) {
+        cftmdl1(128, a, &w[nw - 64]);
+        cftf161(a, &w[nw - 8]);
+        cftf162(&a[32], &w[nw - 32]);
+        cftf161(&a[64], &w[nw - 8]);
+        cftf161(&a[96], &w[nw - 8]);
+        cftmdl2(128, &a[128], &w[nw - 128]);
+        cftf161(&a[128], &w[nw - 8]);
+        cftf162(&a[160], &w[nw - 32]);
+        cftf161(&a[192], &w[nw - 8]);
+        cftf162(&a[224], &w[nw - 32]);
+        cftmdl1(128, &a[256], &w[nw - 64]);
+        cftf161(&a[256], &w[nw - 8]);
+        cftf162(&a[288], &w[nw - 32]);
+        cftf161(&a[320], &w[nw - 8]);
+        cftf161(&a[352], &w[nw - 8]);
+        if (isplt != 0) {
+            cftmdl1(128, &a[384], &w[nw - 64]);
+            cftf161(&a[480], &w[nw - 8]);
+        } else {
+            cftmdl2(128, &a[384], &w[nw - 128]);
+            cftf162(&a[480], &w[nw - 32]);
+        }
+        cftf161(&a[384], &w[nw - 8]);
+        cftf162(&a[416], &w[nw - 32]);
+        cftf161(&a[448], &w[nw - 8]);
+    } else {
+        cftmdl1(64, a, &w[nw - 32]);
+        cftf081(a, &w[nw - 8]);
+        cftf082(&a[16], &w[nw - 8]);
+        cftf081(&a[32], &w[nw - 8]);
+        cftf081(&a[48], &w[nw - 8]);
+        cftmdl2(64, &a[64], &w[nw - 64]);
+        cftf081(&a[64], &w[nw - 8]);
+        cftf082(&a[80], &w[nw - 8]);
+        cftf081(&a[96], &w[nw - 8]);
+        cftf082(&a[112], &w[nw - 8]);
+        cftmdl1(64, &a[128], &w[nw - 32]);
+        cftf081(&a[128], &w[nw - 8]);
+        cftf082(&a[144], &w[nw - 8]);
+        cftf081(&a[160], &w[nw - 8]);
+        cftf081(&a[176], &w[nw - 8]);
+        if (isplt != 0) {
+            cftmdl1(64, &a[192], &w[nw - 32]);
+            cftf081(&a[240], &w[nw - 8]);
+        } else {
+            cftmdl2(64, &a[192], &w[nw - 64]);
+            cftf082(&a[240], &w[nw - 8]);
+        }
+        cftf081(&a[192], &w[nw - 8]);
+        cftf082(&a[208], &w[nw - 8]);
+        cftf081(&a[224], &w[nw - 8]);
+    }
+}
+
+
+void cftmdl1(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, wk1r, wk1i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = a[1] + a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = a[1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j2] = x1r - x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1];
+    k = 0;
+    for (j = 2; j < mh; j += 2) {
+        k += 4;
+        wk1r = w[k];
+        wk1i = w[k + 1];
+        wk3r = w[k + 2];
+        wk3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = a[j + 1] + a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = a[j + 1] - a[j2 + 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1r * x0r - wk1i * x0i;
+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3r * x0r + wk3i * x0i;
+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+        j0 = m - j;
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] + a[j2];
+        x0i = a[j0 + 1] + a[j2 + 1];
+        x1r = a[j0] - a[j2];
+        x1i = a[j0 + 1] - a[j2 + 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        a[j0] = x0r + x2r;
+        a[j0 + 1] = x0i + x2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1i * x0r - wk1r * x0i;
+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3i * x0r + wk3r * x0i;
+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+    }
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] + a[j2];
+    x0i = a[j0 + 1] + a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = a[j0 + 1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i);
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+}
+
+
+void cftmdl2(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, kr, m, mh;
+    double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i;
+
+    mh = n >> 3;
+    m = 2 * mh;
+    wn4r = w[1];
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] - a[j2 + 1];
+    x0i = a[1] + a[j2];
+    x1r = a[0] + a[j2 + 1];
+    x1i = a[1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wn4r * (x2r - x2i);
+    y0i = wn4r * (x2i + x2r);
+    a[0] = x0r + y0r;
+    a[1] = x0i + y0i;
+    a[j1] = x0r - y0r;
+    a[j1 + 1] = x0i - y0i;
+    y0r = wn4r * (x3r - x3i);
+    y0i = wn4r * (x3i + x3r);
+    a[j2] = x1r - y0i;
+    a[j2 + 1] = x1i + y0r;
+    a[j3] = x1r + y0i;
+    a[j3 + 1] = x1i - y0r;
+    k = 0;
+    kr = 2 * m;
+    for (j = 2; j < mh; j += 2) {
+        k += 4;
+        wk1r = w[k];
+        wk1i = w[k + 1];
+        wk3r = w[k + 2];
+        wk3i = w[k + 3];
+        kr -= 4;
+        wd1i = w[kr];
+        wd1r = w[kr + 1];
+        wd3i = w[kr + 2];
+        wd3r = w[kr + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] - a[j2 + 1];
+        x0i = a[j + 1] + a[j2];
+        x1r = a[j] + a[j2 + 1];
+        x1i = a[j + 1] - a[j2];
+        x2r = a[j1] - a[j3 + 1];
+        x2i = a[j1 + 1] + a[j3];
+        x3r = a[j1] + a[j3 + 1];
+        x3i = a[j1 + 1] - a[j3];
+        y0r = wk1r * x0r - wk1i * x0i;
+        y0i = wk1r * x0i + wk1i * x0r;
+        y2r = wd1r * x2r - wd1i * x2i;
+        y2i = wd1r * x2i + wd1i * x2r;
+        a[j] = y0r + y2r;
+        a[j + 1] = y0i + y2i;
+        a[j1] = y0r - y2r;
+        a[j1 + 1] = y0i - y2i;
+        y0r = wk3r * x1r + wk3i * x1i;
+        y0i = wk3r * x1i - wk3i * x1r;
+        y2r = wd3r * x3r + wd3i * x3i;
+        y2i = wd3r * x3i - wd3i * x3r;
+        a[j2] = y0r + y2r;
+        a[j2 + 1] = y0i + y2i;
+        a[j3] = y0r - y2r;
+        a[j3 + 1] = y0i - y2i;
+        j0 = m - j;
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] - a[j2 + 1];
+        x0i = a[j0 + 1] + a[j2];
+        x1r = a[j0] + a[j2 + 1];
+        x1i = a[j0 + 1] - a[j2];
+        x2r = a[j1] - a[j3 + 1];
+        x2i = a[j1 + 1] + a[j3];
+        x3r = a[j1] + a[j3 + 1];
+        x3i = a[j1 + 1] - a[j3];
+        y0r = wd1i * x0r - wd1r * x0i;
+        y0i = wd1i * x0i + wd1r * x0r;
+        y2r = wk1i * x2r - wk1r * x2i;
+        y2i = wk1i * x2i + wk1r * x2r;
+        a[j0] = y0r + y2r;
+        a[j0 + 1] = y0i + y2i;
+        a[j1] = y0r - y2r;
+        a[j1 + 1] = y0i - y2i;
+        y0r = wd3i * x1r + wd3r * x1i;
+        y0i = wd3i * x1i - wd3r * x1r;
+        y2r = wk3i * x3r + wk3r * x3i;
+        y2i = wk3i * x3i - wk3r * x3r;
+        a[j2] = y0r + y2r;
+        a[j2 + 1] = y0i + y2i;
+        a[j3] = y0r - y2r;
+        a[j3 + 1] = y0i - y2i;
+    }
+    wk1r = w[m];
+    wk1i = w[m + 1];
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] - a[j2 + 1];
+    x0i = a[j0 + 1] + a[j2];
+    x1r = a[j0] + a[j2 + 1];
+    x1i = a[j0 + 1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wk1r * x0r - wk1i * x0i;
+    y0i = wk1r * x0i + wk1i * x0r;
+    y2r = wk1i * x2r - wk1r * x2i;
+    y2i = wk1i * x2i + wk1r * x2r;
+    a[j0] = y0r + y2r;
+    a[j0 + 1] = y0i + y2i;
+    a[j1] = y0r - y2r;
+    a[j1 + 1] = y0i - y2i;
+    y0r = wk1i * x1r - wk1r * x1i;
+    y0i = wk1i * x1i + wk1r * x1r;
+    y2r = wk1r * x3r - wk1i * x3i;
+    y2i = wk1r * x3i + wk1i * x3r;
+    a[j2] = y0r - y2r;
+    a[j2 + 1] = y0i - y2i;
+    a[j3] = y0r + y2r;
+    a[j3 + 1] = y0i + y2i;
+}
+
+
+void cftfx41(int n, double *a, int nw, double *w)
+{
+    void cftf161(double *a, double *w);
+    void cftf162(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf082(double *a, double *w);
+
+    if (n == 128) {
+        cftf161(a, &w[nw - 8]);
+        cftf162(&a[32], &w[nw - 32]);
+        cftf161(&a[64], &w[nw - 8]);
+        cftf161(&a[96], &w[nw - 8]);
+    } else {
+        cftf081(a, &w[nw - 8]);
+        cftf082(&a[16], &w[nw - 8]);
+        cftf081(&a[32], &w[nw - 8]);
+        cftf081(&a[48], &w[nw - 8]);
+    }
+}
+
+
+void cftf161(double *a, double *w)
+{
+    double wn4r, wk1r, wk1i,
+        x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i,
+        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i,
+        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+
+    wn4r = w[1];
+    wk1r = w[2];
+    wk1i = w[3];
+    x0r = a[0] + a[16];
+    x0i = a[1] + a[17];
+    x1r = a[0] - a[16];
+    x1i = a[1] - a[17];
+    x2r = a[8] + a[24];
+    x2i = a[9] + a[25];
+    x3r = a[8] - a[24];
+    x3i = a[9] - a[25];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y4r = x0r - x2r;
+    y4i = x0i - x2i;
+    y8r = x1r - x3i;
+    y8i = x1i + x3r;
+    y12r = x1r + x3i;
+    y12i = x1i - x3r;
+    x0r = a[2] + a[18];
+    x0i = a[3] + a[19];
+    x1r = a[2] - a[18];
+    x1i = a[3] - a[19];
+    x2r = a[10] + a[26];
+    x2i = a[11] + a[27];
+    x3r = a[10] - a[26];
+    x3i = a[11] - a[27];
+    y1r = x0r + x2r;
+    y1i = x0i + x2i;
+    y5r = x0r - x2r;
+    y5i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y9r = wk1r * x0r - wk1i * x0i;
+    y9i = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y13r = wk1i * x0r - wk1r * x0i;
+    y13i = wk1i * x0i + wk1r * x0r;
+    x0r = a[4] + a[20];
+    x0i = a[5] + a[21];
+    x1r = a[4] - a[20];
+    x1i = a[5] - a[21];
+    x2r = a[12] + a[28];
+    x2i = a[13] + a[29];
+    x3r = a[12] - a[28];
+    x3i = a[13] - a[29];
+    y2r = x0r + x2r;
+    y2i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y10r = wn4r * (x0r - x0i);
+    y10i = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y14r = wn4r * (x0r + x0i);
+    y14i = wn4r * (x0i - x0r);
+    x0r = a[6] + a[22];
+    x0i = a[7] + a[23];
+    x1r = a[6] - a[22];
+    x1i = a[7] - a[23];
+    x2r = a[14] + a[30];
+    x2i = a[15] + a[31];
+    x3r = a[14] - a[30];
+    x3i = a[15] - a[31];
+    y3r = x0r + x2r;
+    y3i = x0i + x2i;
+    y7r = x0r - x2r;
+    y7i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y11r = wk1i * x0r - wk1r * x0i;
+    y11i = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y15r = wk1r * x0r - wk1i * x0i;
+    y15i = wk1r * x0i + wk1i * x0r;
+    x0r = y12r - y14r;
+    x0i = y12i - y14i;
+    x1r = y12r + y14r;
+    x1i = y12i + y14i;
+    x2r = y13r - y15r;
+    x2i = y13i - y15i;
+    x3r = y13r + y15r;
+    x3i = y13i + y15i;
+    a[24] = x0r + x2r;
+    a[25] = x0i + x2i;
+    a[26] = x0r - x2r;
+    a[27] = x0i - x2i;
+    a[28] = x1r - x3i;
+    a[29] = x1i + x3r;
+    a[30] = x1r + x3i;
+    a[31] = x1i - x3r;
+    x0r = y8r + y10r;
+    x0i = y8i + y10i;
+    x1r = y8r - y10r;
+    x1i = y8i - y10i;
+    x2r = y9r + y11r;
+    x2i = y9i + y11i;
+    x3r = y9r - y11r;
+    x3i = y9i - y11i;
+    a[16] = x0r + x2r;
+    a[17] = x0i + x2i;
+    a[18] = x0r - x2r;
+    a[19] = x0i - x2i;
+    a[20] = x1r - x3i;
+    a[21] = x1i + x3r;
+    a[22] = x1r + x3i;
+    a[23] = x1i - x3r;
+    x0r = y5r - y7i;
+    x0i = y5i + y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    x0r = y5r + y7i;
+    x0i = y5i - y7r;
+    x3r = wn4r * (x0r - x0i);
+    x3i = wn4r * (x0i + x0r);
+    x0r = y4r - y6i;
+    x0i = y4i + y6r;
+    x1r = y4r + y6i;
+    x1i = y4i - y6r;
+    a[8] = x0r + x2r;
+    a[9] = x0i + x2i;
+    a[10] = x0r - x2r;
+    a[11] = x0i - x2i;
+    a[12] = x1r - x3i;
+    a[13] = x1i + x3r;
+    a[14] = x1r + x3i;
+    a[15] = x1i - x3r;
+    x0r = y0r + y2r;
+    x0i = y0i + y2i;
+    x1r = y0r - y2r;
+    x1i = y0i - y2i;
+    x2r = y1r + y3r;
+    x2i = y1i + y3i;
+    x3r = y1r - y3r;
+    x3i = y1i - y3i;
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x0r - x2r;
+    a[3] = x0i - x2i;
+    a[4] = x1r - x3i;
+    a[5] = x1i + x3r;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+}
+
+
+void cftf162(double *a, double *w)
+{
+    double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i,
+        x0r, x0i, x1r, x1i, x2r, x2i,
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i,
+        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i,
+        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+
+    wn4r = w[1];
+    wk1r = w[4];
+    wk1i = w[5];
+    wk3r = w[6];
+    wk3i = -w[7];
+    wk2r = w[8];
+    wk2i = w[9];
+    x1r = a[0] - a[17];
+    x1i = a[1] + a[16];
+    x0r = a[8] - a[25];
+    x0i = a[9] + a[24];
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    y0r = x1r + x2r;
+    y0i = x1i + x2i;
+    y4r = x1r - x2r;
+    y4i = x1i - x2i;
+    x1r = a[0] + a[17];
+    x1i = a[1] - a[16];
+    x0r = a[8] + a[25];
+    x0i = a[9] - a[24];
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    y8r = x1r - x2i;
+    y8i = x1i + x2r;
+    y12r = x1r + x2i;
+    y12i = x1i - x2r;
+    x0r = a[2] - a[19];
+    x0i = a[3] + a[18];
+    x1r = wk1r * x0r - wk1i * x0i;
+    x1i = wk1r * x0i + wk1i * x0r;
+    x0r = a[10] - a[27];
+    x0i = a[11] + a[26];
+    x2r = wk3i * x0r - wk3r * x0i;
+    x2i = wk3i * x0i + wk3r * x0r;
+    y1r = x1r + x2r;
+    y1i = x1i + x2i;
+    y5r = x1r - x2r;
+    y5i = x1i - x2i;
+    x0r = a[2] + a[19];
+    x0i = a[3] - a[18];
+    x1r = wk3r * x0r - wk3i * x0i;
+    x1i = wk3r * x0i + wk3i * x0r;
+    x0r = a[10] + a[27];
+    x0i = a[11] - a[26];
+    x2r = wk1r * x0r + wk1i * x0i;
+    x2i = wk1r * x0i - wk1i * x0r;
+    y9r = x1r - x2r;
+    y9i = x1i - x2i;
+    y13r = x1r + x2r;
+    y13i = x1i + x2i;
+    x0r = a[4] - a[21];
+    x0i = a[5] + a[20];
+    x1r = wk2r * x0r - wk2i * x0i;
+    x1i = wk2r * x0i + wk2i * x0r;
+    x0r = a[12] - a[29];
+    x0i = a[13] + a[28];
+    x2r = wk2i * x0r - wk2r * x0i;
+    x2i = wk2i * x0i + wk2r * x0r;
+    y2r = x1r + x2r;
+    y2i = x1i + x2i;
+    y6r = x1r - x2r;
+    y6i = x1i - x2i;
+    x0r = a[4] + a[21];
+    x0i = a[5] - a[20];
+    x1r = wk2i * x0r - wk2r * x0i;
+    x1i = wk2i * x0i + wk2r * x0r;
+    x0r = a[12] + a[29];
+    x0i = a[13] - a[28];
+    x2r = wk2r * x0r - wk2i * x0i;
+    x2i = wk2r * x0i + wk2i * x0r;
+    y10r = x1r - x2r;
+    y10i = x1i - x2i;
+    y14r = x1r + x2r;
+    y14i = x1i + x2i;
+    x0r = a[6] - a[23];
+    x0i = a[7] + a[22];
+    x1r = wk3r * x0r - wk3i * x0i;
+    x1i = wk3r * x0i + wk3i * x0r;
+    x0r = a[14] - a[31];
+    x0i = a[15] + a[30];
+    x2r = wk1i * x0r - wk1r * x0i;
+    x2i = wk1i * x0i + wk1r * x0r;
+    y3r = x1r + x2r;
+    y3i = x1i + x2i;
+    y7r = x1r - x2r;
+    y7i = x1i - x2i;
+    x0r = a[6] + a[23];
+    x0i = a[7] - a[22];
+    x1r = wk1i * x0r + wk1r * x0i;
+    x1i = wk1i * x0i - wk1r * x0r;
+    x0r = a[14] + a[31];
+    x0i = a[15] - a[30];
+    x2r = wk3i * x0r - wk3r * x0i;
+    x2i = wk3i * x0i + wk3r * x0r;
+    y11r = x1r + x2r;
+    y11i = x1i + x2i;
+    y15r = x1r - x2r;
+    y15i = x1i - x2i;
+    x1r = y0r + y2r;
+    x1i = y0i + y2i;
+    x2r = y1r + y3r;
+    x2i = y1i + y3i;
+    a[0] = x1r + x2r;
+    a[1] = x1i + x2i;
+    a[2] = x1r - x2r;
+    a[3] = x1i - x2i;
+    x1r = y0r - y2r;
+    x1i = y0i - y2i;
+    x2r = y1r - y3r;
+    x2i = y1i - y3i;
+    a[4] = x1r - x2i;
+    a[5] = x1i + x2r;
+    a[6] = x1r + x2i;
+    a[7] = x1i - x2r;
+    x1r = y4r - y6i;
+    x1i = y4i + y6r;
+    x0r = y5r - y7i;
+    x0i = y5i + y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[8] = x1r + x2r;
+    a[9] = x1i + x2i;
+    a[10] = x1r - x2r;
+    a[11] = x1i - x2i;
+    x1r = y4r + y6i;
+    x1i = y4i - y6r;
+    x0r = y5r + y7i;
+    x0i = y5i - y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[12] = x1r - x2i;
+    a[13] = x1i + x2r;
+    a[14] = x1r + x2i;
+    a[15] = x1i - x2r;
+    x1r = y8r + y10r;
+    x1i = y8i + y10i;
+    x2r = y9r - y11r;
+    x2i = y9i - y11i;
+    a[16] = x1r + x2r;
+    a[17] = x1i + x2i;
+    a[18] = x1r - x2r;
+    a[19] = x1i - x2i;
+    x1r = y8r - y10r;
+    x1i = y8i - y10i;
+    x2r = y9r + y11r;
+    x2i = y9i + y11i;
+    a[20] = x1r - x2i;
+    a[21] = x1i + x2r;
+    a[22] = x1r + x2i;
+    a[23] = x1i - x2r;
+    x1r = y12r - y14i;
+    x1i = y12i + y14r;
+    x0r = y13r + y15i;
+    x0i = y13i - y15r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[24] = x1r + x2r;
+    a[25] = x1i + x2i;
+    a[26] = x1r - x2r;
+    a[27] = x1i - x2i;
+    x1r = y12r + y14i;
+    x1i = y12i - y14r;
+    x0r = y13r - y15i;
+    x0i = y13i + y15r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[28] = x1r - x2i;
+    a[29] = x1i + x2r;
+    a[30] = x1r + x2i;
+    a[31] = x1i - x2r;
+}
+
+
+void cftf081(double *a, double *w)
+{
+    double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+
+    wn4r = w[1];
+    x0r = a[0] + a[8];
+    x0i = a[1] + a[9];
+    x1r = a[0] - a[8];
+    x1i = a[1] - a[9];
+    x2r = a[4] + a[12];
+    x2i = a[5] + a[13];
+    x3r = a[4] - a[12];
+    x3i = a[5] - a[13];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y2r = x0r - x2r;
+    y2i = x0i - x2i;
+    y1r = x1r - x3i;
+    y1i = x1i + x3r;
+    y3r = x1r + x3i;
+    y3i = x1i - x3r;
+    x0r = a[2] + a[10];
+    x0i = a[3] + a[11];
+    x1r = a[2] - a[10];
+    x1i = a[3] - a[11];
+    x2r = a[6] + a[14];
+    x2i = a[7] + a[15];
+    x3r = a[6] - a[14];
+    x3i = a[7] - a[15];
+    y4r = x0r + x2r;
+    y4i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    x2r = x1r + x3i;
+    x2i = x1i - x3r;
+    y5r = wn4r * (x0r - x0i);
+    y5i = wn4r * (x0r + x0i);
+    y7r = wn4r * (x2r - x2i);
+    y7i = wn4r * (x2r + x2i);
+    a[8] = y1r + y5r;
+    a[9] = y1i + y5i;
+    a[10] = y1r - y5r;
+    a[11] = y1i - y5i;
+    a[12] = y3r - y7i;
+    a[13] = y3i + y7r;
+    a[14] = y3r + y7i;
+    a[15] = y3i - y7r;
+    a[0] = y0r + y4r;
+    a[1] = y0i + y4i;
+    a[2] = y0r - y4r;
+    a[3] = y0i - y4i;
+    a[4] = y2r - y6i;
+    a[5] = y2i + y6r;
+    a[6] = y2r + y6i;
+    a[7] = y2i - y6r;
+}
+
+
+void cftf082(double *a, double *w)
+{
+    double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i,
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+
+    wn4r = w[1];
+    wk1r = w[2];
+    wk1i = w[3];
+    y0r = a[0] - a[9];
+    y0i = a[1] + a[8];
+    y1r = a[0] + a[9];
+    y1i = a[1] - a[8];
+    x0r = a[4] - a[13];
+    x0i = a[5] + a[12];
+    y2r = wn4r * (x0r - x0i);
+    y2i = wn4r * (x0i + x0r);
+    x0r = a[4] + a[13];
+    x0i = a[5] - a[12];
+    y3r = wn4r * (x0r - x0i);
+    y3i = wn4r * (x0i + x0r);
+    x0r = a[2] - a[11];
+    x0i = a[3] + a[10];
+    y4r = wk1r * x0r - wk1i * x0i;
+    y4i = wk1r * x0i + wk1i * x0r;
+    x0r = a[2] + a[11];
+    x0i = a[3] - a[10];
+    y5r = wk1i * x0r - wk1r * x0i;
+    y5i = wk1i * x0i + wk1r * x0r;
+    x0r = a[6] - a[15];
+    x0i = a[7] + a[14];
+    y6r = wk1i * x0r - wk1r * x0i;
+    y6i = wk1i * x0i + wk1r * x0r;
+    x0r = a[6] + a[15];
+    x0i = a[7] - a[14];
+    y7r = wk1r * x0r - wk1i * x0i;
+    y7i = wk1r * x0i + wk1i * x0r;
+    x0r = y0r + y2r;
+    x0i = y0i + y2i;
+    x1r = y4r + y6r;
+    x1i = y4i + y6i;
+    a[0] = x0r + x1r;
+    a[1] = x0i + x1i;
+    a[2] = x0r - x1r;
+    a[3] = x0i - x1i;
+    x0r = y0r - y2r;
+    x0i = y0i - y2i;
+    x1r = y4r - y6r;
+    x1i = y4i - y6i;
+    a[4] = x0r - x1i;
+    a[5] = x0i + x1r;
+    a[6] = x0r + x1i;
+    a[7] = x0i - x1r;
+    x0r = y1r - y3i;
+    x0i = y1i + y3r;
+    x1r = y5r - y7r;
+    x1i = y5i - y7i;
+    a[8] = x0r + x1r;
+    a[9] = x0i + x1i;
+    a[10] = x0r - x1r;
+    a[11] = x0i - x1i;
+    x0r = y1r + y3i;
+    x0i = y1i - y3r;
+    x1r = y5r + y7r;
+    x1i = y5i + y7i;
+    a[12] = x0r - x1i;
+    a[13] = x0i + x1r;
+    a[14] = x0r + x1i;
+    a[15] = x0i - x1r;
+}
+
+
+void cftf040(double *a)
+{
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+    x0r = a[0] + a[4];
+    x0i = a[1] + a[5];
+    x1r = a[0] - a[4];
+    x1i = a[1] - a[5];
+    x2r = a[2] + a[6];
+    x2i = a[3] + a[7];
+    x3r = a[2] - a[6];
+    x3i = a[3] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x1r - x3i;
+    a[3] = x1i + x3r;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+}
+
+
+void cftb040(double *a)
+{
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+    x0r = a[0] + a[4];
+    x0i = a[1] + a[5];
+    x1r = a[0] - a[4];
+    x1i = a[1] - a[5];
+    x2r = a[2] + a[6];
+    x2i = a[3] + a[7];
+    x3r = a[2] - a[6];
+    x3i = a[3] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x1r + x3i;
+    a[3] = x1i - x3r;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[6] = x1r - x3i;
+    a[7] = x1i + x3r;
+}
+
+
+void cftx020(double *a)
+{
+    double x0r, x0i;
+
+    x0r = a[0] - a[2];
+    x0i = a[1] - a[3];
+    a[0] += a[2];
+    a[1] += a[3];
+    a[2] = x0r;
+    a[3] = x0i;
+}
+
+
+void rftfsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+
+    m = n >> 1;
+    ks = 2 * nc / m;
+    kk = 0;
+    for (j = 2; j < m; j += 2) {
+        k = n - j;
+        kk += ks;
+        wkr = 0.5 - c[nc - kk];
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr - wki * xi;
+        yi = wkr * xi + wki * xr;
+        a[j] -= yr;
+        a[j + 1] -= yi;
+        a[k] += yr;
+        a[k + 1] -= yi;
+    }
+}
+
+
+void rftbsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+
+    m = n >> 1;
+    ks = 2 * nc / m;
+    kk = 0;
+    for (j = 2; j < m; j += 2) {
+        k = n - j;
+        kk += ks;
+        wkr = 0.5 - c[nc - kk];
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr + wki * xi;
+        yi = wkr * xi - wki * xr;
+        a[j] -= yr;
+        a[j + 1] -= yi;
+        a[k] += yr;
+        a[k + 1] -= yi;
+    }
+}
+
+

+ 142 - 0
ggml/examples/kaldi-native-fbank/csrc/log.cc

@@ -0,0 +1,142 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Stack trace related stuff is from kaldi.
+ * Refer to
+ * https://github.com/kaldi-asr/kaldi/blob/master/src/base/kaldi-error.cc
+ */
+
+#include "log.h"
+
+#ifdef KNF_HAVE_EXECINFO_H
+#include <execinfo.h>  // To get stack trace in error messages.
+#ifdef KNF_HAVE_CXXABI_H
+#include <cxxabi.h>  // For name demangling.
+// Useful to decode the stack trace, but only used if we have execinfo.h
+#endif  // KNF_HAVE_CXXABI_H
+#endif  // KNF_HAVE_EXECINFO_H
+
+#include <stdlib.h>
+
+#include <ctime>
+#include <iomanip>
+#include <string>
+
+namespace knf {
+
+std::string GetDateTimeStr() {
+  std::ostringstream os;
+  std::time_t t = std::time(nullptr);
+  std::tm tm = *std::localtime(&t);
+  os << std::put_time(&tm, "%F %T");  // yyyy-mm-dd hh:mm:ss
+  return os.str();
+}
+
+static bool LocateSymbolRange(const std::string &trace_name, std::size_t *begin,
+                              std::size_t *end) {
+  // Find the first '_' with leading ' ' or '('.
+  *begin = std::string::npos;
+  for (std::size_t i = 1; i < trace_name.size(); ++i) {
+    if (trace_name[i] != '_') {
+      continue;
+    }
+    if (trace_name[i - 1] == ' ' || trace_name[i - 1] == '(') {
+      *begin = i;
+      break;
+    }
+  }
+  if (*begin == std::string::npos) {
+    return false;
+  }
+  *end = trace_name.find_first_of(" +", *begin);
+  return *end != std::string::npos;
+}
+
+#ifdef KNF_HAVE_EXECINFO_H
+static std::string Demangle(const std::string &trace_name) {
+#ifndef KNF_HAVE_CXXABI_H
+  return trace_name;
+#else   // KNF_HAVE_CXXABI_H
+  // Try demangle the symbol. We are trying to support the following formats
+  // produced by different platforms:
+  //
+  // Linux:
+  //   ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
+  //
+  // Mac:
+  //   0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813
+  //
+  // We want to extract the name e.g., '_ZN5kaldi13UnitTestErrorEv' and
+  // demangle it info a readable name like kaldi::UnitTextError.
+  std::size_t begin, end;
+  if (!LocateSymbolRange(trace_name, &begin, &end)) {
+    return trace_name;
+  }
+  std::string symbol = trace_name.substr(begin, end - begin);
+  int status;
+  char *demangled_name = abi::__cxa_demangle(symbol.c_str(), 0, 0, &status);
+  if (status == 0 && demangled_name != nullptr) {
+    symbol = demangled_name;
+    free(demangled_name);
+  }
+  return trace_name.substr(0, begin) + symbol +
+         trace_name.substr(end, std::string::npos);
+#endif  // KNF_HAVE_CXXABI_H
+}
+#endif  // KNF_HAVE_EXECINFO_H
+
+std::string GetStackTrace() {
+  std::string ans;
+#ifdef KNF_HAVE_EXECINFO_H
+  constexpr const std::size_t kMaxTraceSize = 50;
+  constexpr const std::size_t kMaxTracePrint = 50;  // Must be even.
+                                                    // Buffer for the trace.
+  void *trace[kMaxTraceSize];
+  // Get the trace.
+  std::size_t size = backtrace(trace, kMaxTraceSize);
+  // Get the trace symbols.
+  char **trace_symbol = backtrace_symbols(trace, size);
+  if (trace_symbol == nullptr) return ans;
+
+  // Compose a human-readable backtrace string.
+  ans += "[ Stack-Trace: ]\n";
+  if (size <= kMaxTracePrint) {
+    for (std::size_t i = 0; i < size; ++i) {
+      ans += Demangle(trace_symbol[i]) + "\n";
+    }
+  } else {  // Print out first+last (e.g.) 5.
+    for (std::size_t i = 0; i < kMaxTracePrint / 2; ++i) {
+      ans += Demangle(trace_symbol[i]) + "\n";
+    }
+    ans += ".\n.\n.\n";
+    for (std::size_t i = size - kMaxTracePrint / 2; i < size; ++i) {
+      ans += Demangle(trace_symbol[i]) + "\n";
+    }
+    if (size == kMaxTraceSize)
+      ans += ".\n.\n.\n";  // Stack was too long, probably a bug.
+  }
+
+  // We must free the array of pointers allocated by backtrace_symbols(),
+  // but not the strings themselves.
+  free(trace_symbol);
+#endif  // KNF_HAVE_EXECINFO_H
+  return ans;
+}
+
+}  // namespace knf

+ 383 - 0
ggml/examples/kaldi-native-fbank/csrc/log.h

@@ -0,0 +1,383 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The content in this file is copied/modified from
+// https://github.com/k2-fsa/k2/blob/master/k2/csrc/log.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_LOG_H_
+#define KALDI_NATIVE_FBANK_CSRC_LOG_H_
+
+#include <stdio.h>
+
+#include <mutex>  // NOLINT
+#include <sstream>
+#include <string>
+
+namespace knf {
+
+#if KNF_ENABLE_CHECK
+
+#if defined(NDEBUG)
+constexpr bool kDisableDebug = true;
+#else
+constexpr bool kDisableDebug = false;
+#endif
+
+enum class LogLevel {
+  kTrace = 0,
+  kDebug = 1,
+  kInfo = 2,
+  kWarning = 3,
+  kError = 4,
+  kFatal = 5,  // print message and abort the program
+};
+
+// They are used in KNF_LOG(xxx), so their names
+// do not follow the google c++ code style
+//
+// You can use them in the following way:
+//
+//  KNF_LOG(TRACE) << "some message";
+//  KNF_LOG(DEBUG) << "some message";
+#ifndef _MSC_VER
+constexpr LogLevel TRACE = LogLevel::kTrace;
+constexpr LogLevel DEBUG = LogLevel::kDebug;
+constexpr LogLevel INFO = LogLevel::kInfo;
+constexpr LogLevel WARNING = LogLevel::kWarning;
+constexpr LogLevel ERROR = LogLevel::kError;
+constexpr LogLevel FATAL = LogLevel::kFatal;
+#else
+#define TRACE LogLevel::kTrace
+#define DEBUG LogLevel::kDebug
+#define INFO LogLevel::kInfo
+#define WARNING LogLevel::kWarning
+#define ERROR LogLevel::kError
+#define FATAL LogLevel::kFatal
+#endif
+
+std::string GetStackTrace();
+
+/* Return the current log level.
+
+
+   If the current log level is TRACE, then all logged messages are printed out.
+
+   If the current log level is DEBUG, log messages with "TRACE" level are not
+   shown and all other levels are printed out.
+
+   Similarly, if the current log level is INFO, log message with "TRACE" and
+   "DEBUG" are not shown and all other levels are printed out.
+
+   If it is FATAL, then only FATAL messages are shown.
+ */
+inline LogLevel GetCurrentLogLevel() {
+  static LogLevel log_level = INFO;
+  static std::once_flag init_flag;
+  std::call_once(init_flag, []() {
+    const char *env_log_level = std::getenv("KNF_LOG_LEVEL");
+    if (env_log_level == nullptr) return;
+
+    std::string s = env_log_level;
+    if (s == "TRACE")
+      log_level = TRACE;
+    else if (s == "DEBUG")
+      log_level = DEBUG;
+    else if (s == "INFO")
+      log_level = INFO;
+    else if (s == "WARNING")
+      log_level = WARNING;
+    else if (s == "ERROR")
+      log_level = ERROR;
+    else if (s == "FATAL")
+      log_level = FATAL;
+    else
+      fprintf(stderr,
+              "Unknown KNF_LOG_LEVEL: %s"
+              "\nSupported values are: "
+              "TRACE, DEBUG, INFO, WARNING, ERROR, FATAL",
+              s.c_str());
+  });
+  return log_level;
+}
+
+inline bool EnableAbort() {
+  static std::once_flag init_flag;
+  static bool enable_abort = false;
+  std::call_once(init_flag, []() {
+    enable_abort = (std::getenv("KNF_ABORT") != nullptr);
+  });
+  return enable_abort;
+}
+
+class Logger {
+ public:
+  Logger(const char *filename, const char *func_name, uint32_t line_num,
+         LogLevel level)
+      : filename_(filename),
+        func_name_(func_name),
+        line_num_(line_num),
+        level_(level) {
+    cur_level_ = GetCurrentLogLevel();
+    fprintf(stderr, "here\n");
+    switch (level) {
+      case TRACE:
+        if (cur_level_ <= TRACE) fprintf(stderr, "[T] ");
+        break;
+      case DEBUG:
+        if (cur_level_ <= DEBUG) fprintf(stderr, "[D] ");
+        break;
+      case INFO:
+        if (cur_level_ <= INFO) fprintf(stderr, "[I] ");
+        break;
+      case WARNING:
+        if (cur_level_ <= WARNING) fprintf(stderr, "[W] ");
+        break;
+      case ERROR:
+        if (cur_level_ <= ERROR) fprintf(stderr, "[E] ");
+        break;
+      case FATAL:
+        if (cur_level_ <= FATAL) fprintf(stderr, "[F] ");
+        break;
+    }
+
+    if (cur_level_ <= level_) {
+      fprintf(stderr, "%s:%u:%s ", filename, line_num, func_name);
+    }
+  }
+
+  ~Logger() noexcept(false) {
+    static constexpr const char *kErrMsg = R"(
+    Some bad things happened. Please read the above error messages and stack
+    trace. If you are using Python, the following command may be helpful:
+
+      gdb --args python /path/to/your/code.py
+
+    (You can use `gdb` to debug the code. Please consider compiling
+    a debug version of KNF.).
+
+    If you are unable to fix it, please open an issue at:
+
+      https://github.com/csukuangfj/kaldi-native-fbank/issues/new
+    )";
+    fprintf(stderr, "\n");
+    if (level_ == FATAL) {
+      std::string stack_trace = GetStackTrace();
+      if (!stack_trace.empty()) {
+        fprintf(stderr, "\n\n%s\n", stack_trace.c_str());
+      }
+
+      fflush(nullptr);
+
+#ifndef __ANDROID_API__
+      if (EnableAbort()) {
+        // NOTE: abort() will terminate the program immediately without
+        // printing the Python stack backtrace.
+        abort();
+      }
+
+      throw std::runtime_error(kErrMsg);
+#else
+      abort();
+#endif
+    }
+  }
+
+  const Logger &operator<<(bool b) const {
+    if (cur_level_ <= level_) {
+      fprintf(stderr, b ? "true" : "false");
+    }
+    return *this;
+  }
+
+  const Logger &operator<<(int8_t i) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%d", i);
+    return *this;
+  }
+
+  const Logger &operator<<(const char *s) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%s", s);
+    return *this;
+  }
+
+  const Logger &operator<<(int32_t i) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%d", i);
+    return *this;
+  }
+
+  const Logger &operator<<(uint32_t i) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%u", i);
+    return *this;
+  }
+
+  const Logger &operator<<(uint64_t i) const {
+    if (cur_level_ <= level_)
+      fprintf(stderr, "%llu", (long long unsigned int)i);  // NOLINT
+    return *this;
+  }
+
+  const Logger &operator<<(int64_t i) const {
+    if (cur_level_ <= level_)
+      fprintf(stderr, "%lli", (long long int)i);  // NOLINT
+    return *this;
+  }
+
+  const Logger &operator<<(float f) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%f", f);
+    return *this;
+  }
+
+  const Logger &operator<<(double d) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%f", d);
+    return *this;
+  }
+
+  template <typename T>
+  const Logger &operator<<(const T &t) const {
+    // require T overloads operator<<
+    std::ostringstream os;
+    os << t;
+    return *this << os.str().c_str();
+  }
+
+  // specialization to fix compile error: `stringstream << nullptr` is ambiguous
+  const Logger &operator<<(const std::nullptr_t &null) const {
+    if (cur_level_ <= level_) *this << "(null)";
+    return *this;
+  }
+
+ private:
+  const char *filename_;
+  const char *func_name_;
+  uint32_t line_num_;
+  LogLevel level_;
+  LogLevel cur_level_;
+};
+#endif  // KNF_ENABLE_CHECK
+
+class Voidifier {
+ public:
+#if KNF_ENABLE_CHECK
+  void operator&(const Logger &) const {}
+#endif
+};
+#if !defined(KNF_ENABLE_CHECK)
+template <typename T>
+const Voidifier &operator<<(const Voidifier &v, T &&) {
+  return v;
+}
+#endif
+
+}  // namespace knf
+
+#define KNF_STATIC_ASSERT(x) static_assert(x, "")
+
+#ifdef KNF_ENABLE_CHECK
+
+#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) || \
+    defined(__PRETTY_FUNCTION__)
+// for clang and GCC
+#define KNF_FUNC __PRETTY_FUNCTION__
+#else
+// for other compilers
+#define KNF_FUNC __func__
+#endif
+
+#define KNF_CHECK(x)                                                  \
+  (x) ? (void)0                                                       \
+      : ::knf::Voidifier() &                                          \
+            ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
+                << "Check failed: " << #x << " "
+
+// WARNING: x and y may be evaluated multiple times, but this happens only
+// when the check fails. Since the program aborts if it fails, we don't think
+// the extra evaluation of x and y matters.
+//
+// CAUTION: we recommend the following use case:
+//
+//      auto x = Foo();
+//      auto y = Bar();
+//      KNF_CHECK_EQ(x, y) << "Some message";
+//
+//  And please avoid
+//
+//      KNF_CHECK_EQ(Foo(), Bar());
+//
+//  if `Foo()` or `Bar()` causes some side effects, e.g., changing some
+//  local static variables or global variables.
+#define _KNF_CHECK_OP(x, y, op)                                              \
+  ((x)op(y)) ? (void)0                                                       \
+             : ::knf::Voidifier() &                                          \
+                   ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
+                       << "Check failed: " << #x << " " << #op << " " << #y  \
+                       << " (" << (x) << " vs. " << (y) << ") "
+
+#define KNF_CHECK_EQ(x, y) _KNF_CHECK_OP(x, y, ==)
+#define KNF_CHECK_NE(x, y) _KNF_CHECK_OP(x, y, !=)
+#define KNF_CHECK_LT(x, y) _KNF_CHECK_OP(x, y, <)
+#define KNF_CHECK_LE(x, y) _KNF_CHECK_OP(x, y, <=)
+#define KNF_CHECK_GT(x, y) _KNF_CHECK_OP(x, y, >)
+#define KNF_CHECK_GE(x, y) _KNF_CHECK_OP(x, y, >=)
+
+#define KNF_LOG(x) ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::x)
+
+// ------------------------------------------------------------
+//       For debug check
+// ------------------------------------------------------------
+// If you define the macro "-D NDEBUG" while compiling kaldi-native-fbank,
+// the following macros are in fact empty and does nothing.
+
+#define KNF_DCHECK(x) ::knf::kDisableDebug ? (void)0 : KNF_CHECK(x)
+
+#define KNF_DCHECK_EQ(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_EQ(x, y)
+
+#define KNF_DCHECK_NE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_NE(x, y)
+
+#define KNF_DCHECK_LT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LT(x, y)
+
+#define KNF_DCHECK_LE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LE(x, y)
+
+#define KNF_DCHECK_GT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GT(x, y)
+
+#define KNF_DCHECK_GE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GE(x, y)
+
+#define KNF_DLOG(x) \
+  ::knf::kDisableDebug ? (void)0 : ::knf::Voidifier() & KNF_LOG(x)
+
+#else
+
+#define KNF_CHECK(x) ::knf::Voidifier()
+#define KNF_LOG(x) ::knf::Voidifier()
+
+#define KNF_CHECK_EQ(x, y) ::knf::Voidifier()
+#define KNF_CHECK_NE(x, y) ::knf::Voidifier()
+#define KNF_CHECK_LT(x, y) ::knf::Voidifier()
+#define KNF_CHECK_LE(x, y) ::knf::Voidifier()
+#define KNF_CHECK_GT(x, y) ::knf::Voidifier()
+#define KNF_CHECK_GE(x, y) ::knf::Voidifier()
+
+#define KNF_DCHECK(x) ::knf::Voidifier()
+#define KNF_DLOG(x) ::knf::Voidifier()
+#define KNF_DCHECK_EQ(x, y) ::knf::Voidifier()
+#define KNF_DCHECK_NE(x, y) ::knf::Voidifier()
+#define KNF_DCHECK_LT(x, y) ::knf::Voidifier()
+#define KNF_DCHECK_LE(x, y) ::knf::Voidifier()
+#define KNF_DCHECK_GT(x, y) ::knf::Voidifier()
+#define KNF_DCHECK_GE(x, y) ::knf::Voidifier()
+
+#endif  // KNF_CHECK_NE
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_LOG_H_

+ 257 - 0
ggml/examples/kaldi-native-fbank/csrc/mel-computations.cc

@@ -0,0 +1,257 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/mel-computations.cc
+
+#include "mel-computations.h"
+
+#include <algorithm>
+#include <sstream>
+#include <vector>
+
+#include "feature-window.h"
+
+namespace knf {
+
+std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts) {
+  os << opts.ToString();
+  return os;
+}
+
+float MelBanks::VtlnWarpFreq(
+    float vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
+    float vtln_high_cutoff,
+    float low_freq,  // upper+lower frequency cutoffs in mel computation
+    float high_freq, float vtln_warp_factor, float freq) {
+  /// This computes a VTLN warping function that is not the same as HTK's one,
+  /// but has similar inputs (this function has the advantage of never producing
+  /// empty bins).
+
+  /// This function computes a warp function F(freq), defined between low_freq
+  /// and high_freq inclusive, with the following properties:
+  ///  F(low_freq) == low_freq
+  ///  F(high_freq) == high_freq
+  /// The function is continuous and piecewise linear with two inflection
+  ///   points.
+  /// The lower inflection point (measured in terms of the unwarped
+  ///  frequency) is at frequency l, determined as described below.
+  /// The higher inflection point is at a frequency h, determined as
+  ///   described below.
+  /// If l <= f <= h, then F(f) = f/vtln_warp_factor.
+  /// If the higher inflection point (measured in terms of the unwarped
+  ///   frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
+  ///   Since (by the last point) F(h) == h/vtln_warp_factor, then
+  ///   max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
+  ///   h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
+  ///     = vtln_high_cutoff * min(1, vtln_warp_factor).
+  /// If the lower inflection point (measured in terms of the unwarped
+  ///   frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
+  ///   This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
+  ///                       = vtln_low_cutoff * max(1, vtln_warp_factor)
+
+  if (freq < low_freq || freq > high_freq)
+    return freq;  // in case this gets called
+  // for out-of-range frequencies, just return the freq.
+
+  KNF_CHECK_GT(vtln_low_cutoff, low_freq);
+  KNF_CHECK_LT(vtln_high_cutoff, high_freq);
+
+  float one = 1.0f;
+  float l = vtln_low_cutoff * std::max(one, vtln_warp_factor);
+  float h = vtln_high_cutoff * std::min(one, vtln_warp_factor);
+  float scale = 1.0f / vtln_warp_factor;
+  float Fl = scale * l;  // F(l);
+  float Fh = scale * h;  // F(h);
+  KNF_CHECK(l > low_freq && h < high_freq);
+  // slope of left part of the 3-piece linear function
+  float scale_left = (Fl - low_freq) / (l - low_freq);
+  // [slope of center part is just "scale"]
+
+  // slope of right part of the 3-piece linear function
+  float scale_right = (high_freq - Fh) / (high_freq - h);
+
+  if (freq < l) {
+    return low_freq + scale_left * (freq - low_freq);
+  } else if (freq < h) {
+    return scale * freq;
+  } else {  // freq >= h
+    return high_freq + scale_right * (freq - high_freq);
+  }
+}
+
+float MelBanks::VtlnWarpMelFreq(
+    float vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
+    float vtln_high_cutoff,
+    float low_freq,  // upper+lower frequency cutoffs in mel computation
+    float high_freq, float vtln_warp_factor, float mel_freq) {
+  return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, low_freq,
+                               high_freq, vtln_warp_factor,
+                               InverseMelScale(mel_freq)));
+}
+
+MelBanks::MelBanks(const MelBanksOptions &opts,
+                   const FrameExtractionOptions &frame_opts,
+                   float vtln_warp_factor)
+    : htk_mode_(opts.htk_mode) {
+  int32_t num_bins = opts.num_bins;
+  if (num_bins < 3) KNF_LOG(FATAL) << "Must have at least 3 mel bins";
+
+  float sample_freq = frame_opts.samp_freq;
+  int32_t window_length_padded = frame_opts.PaddedWindowSize();
+  KNF_CHECK_EQ(window_length_padded % 2, 0);
+
+  int32_t num_fft_bins = window_length_padded / 2;
+  float nyquist = 0.5f * sample_freq;
+
+  float low_freq = opts.low_freq, high_freq;
+  if (opts.high_freq > 0.0f)
+    high_freq = opts.high_freq;
+  else
+    high_freq = nyquist + opts.high_freq;
+
+  if (low_freq < 0.0f || low_freq >= nyquist || high_freq <= 0.0f ||
+      high_freq > nyquist || high_freq <= low_freq) {
+    KNF_LOG(FATAL) << "Bad values in options: low-freq " << low_freq
+                   << " and high-freq " << high_freq << " vs. nyquist "
+                   << nyquist;
+  }
+
+  float fft_bin_width = sample_freq / window_length_padded;
+  // fft-bin width [think of it as Nyquist-freq / half-window-length]
+
+  float mel_low_freq = MelScale(low_freq);
+  float mel_high_freq = MelScale(high_freq);
+
+  debug_ = opts.debug_mel;
+
+  // divide by num_bins+1 in next line because of end-effects where the bins
+  // spread out to the sides.
+  float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1);
+
+  float vtln_low = opts.vtln_low, vtln_high = opts.vtln_high;
+  if (vtln_high < 0.0f) {
+    vtln_high += nyquist;
+  }
+
+  if (vtln_warp_factor != 1.0f &&
+      (vtln_low < 0.0f || vtln_low <= low_freq || vtln_low >= high_freq ||
+       vtln_high <= 0.0f || vtln_high >= high_freq || vtln_high <= vtln_low)) {
+    KNF_LOG(FATAL) << "Bad values in options: vtln-low " << vtln_low
+                   << " and vtln-high " << vtln_high << ", versus "
+                   << "low-freq " << low_freq << " and high-freq " << high_freq;
+  }
+
+  bins_.resize(num_bins);
+  center_freqs_.resize(num_bins);
+
+  for (int32_t bin = 0; bin < num_bins; ++bin) {
+    float left_mel = mel_low_freq + bin * mel_freq_delta,
+          center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
+          right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;
+
+    if (vtln_warp_factor != 1.0f) {
+      left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
+                                 vtln_warp_factor, left_mel);
+      center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
+                                   vtln_warp_factor, center_mel);
+      right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
+                                  vtln_warp_factor, right_mel);
+    }
+    center_freqs_[bin] = InverseMelScale(center_mel);
+
+    // this_bin will be a vector of coefficients that is only
+    // nonzero where this mel bin is active.
+    std::vector<float> this_bin(num_fft_bins);
+
+    int32_t first_index = -1, last_index = -1;
+    for (int32_t i = 0; i < num_fft_bins; ++i) {
+      float freq = (fft_bin_width * i);  // Center frequency of this fft
+                                         // bin.
+      float mel = MelScale(freq);
+      if (mel > left_mel && mel < right_mel) {
+        float weight;
+        if (mel <= center_mel)
+          weight = (mel - left_mel) / (center_mel - left_mel);
+        else
+          weight = (right_mel - mel) / (right_mel - center_mel);
+        this_bin[i] = weight;
+        if (first_index == -1) first_index = i;
+        last_index = i;
+      }
+    }
+    KNF_CHECK(first_index != -1 && last_index >= first_index &&
+              "You may have set num_mel_bins too large.");
+
+    bins_[bin].first = first_index;
+    int32_t size = last_index + 1 - first_index;
+    bins_[bin].second.insert(bins_[bin].second.end(),
+                             this_bin.begin() + first_index,
+                             this_bin.begin() + first_index + size);
+
+    // Replicate a bug in HTK, for testing purposes.
+    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0f) {
+      bins_[bin].second[0] = 0.0;
+    }
+  }  // for (int32_t bin = 0; bin < num_bins; ++bin) {
+
+  if (debug_) {
+    std::ostringstream os;
+    for (size_t i = 0; i < bins_.size(); i++) {
+      os << "bin " << i << ", offset = " << bins_[i].first << ", vec = ";
+      for (auto k : bins_[i].second) os << k << ", ";
+      os << "\n";
+    }
+    KNF_LOG(INFO) << os.str();
+  }
+}
+
+// "power_spectrum" contains fft energies.
+void MelBanks::Compute(const float *power_spectrum,
+                       float *mel_energies_out) const {
+  int32_t num_bins = bins_.size();
+
+  for (int32_t i = 0; i < num_bins; i++) {
+    int32_t offset = bins_[i].first;
+    const auto &v = bins_[i].second;
+    float energy = 0;
+    for (int32_t k = 0; k != v.size(); ++k) {
+      energy += v[k] * power_spectrum[k + offset];
+    }
+
+    // HTK-like flooring- for testing purposes (we prefer dither)
+    if (htk_mode_ && energy < 1.0) {
+      energy = 1.0;
+    }
+
+    mel_energies_out[i] = energy;
+
+    // The following assert was added due to a problem with OpenBlas that
+    // we had at one point (it was a bug in that library).  Just to detect
+    // it early.
+    KNF_CHECK_EQ(energy, energy);  // check that energy is not nan
+  }
+
+  if (debug_) {
+    fprintf(stderr, "MEL BANKS:\n");
+    for (int32_t i = 0; i < num_bins; i++)
+      fprintf(stderr, " %f", mel_energies_out[i]);
+    fprintf(stderr, "\n");
+  }
+}
+
+}  // namespace knf

+ 117 - 0
ggml/examples/kaldi-native-fbank/csrc/mel-computations.h

@@ -0,0 +1,117 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// This file is copied/modified from kaldi/src/feat/mel-computations.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
+#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
+
+#include <cmath>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "feature-window.h"
+
+namespace knf {
+
+struct MelBanksOptions {
+  int32_t num_bins = 25;  // e.g. 25; number of triangular bins
+  float low_freq = 20;    // e.g. 20; lower frequency cutoff
+
+  // an upper frequency cutoff; 0 -> no cutoff, negative
+  // ->added to the Nyquist frequency to get the cutoff.
+  float high_freq = 0;
+
+  float vtln_low = 100;  // vtln lower cutoff of warping function.
+
+  // vtln upper cutoff of warping function: if negative, added
+  // to the Nyquist frequency to get the cutoff.
+  float vtln_high = -500;
+
+  bool debug_mel = false;
+  // htk_mode is a "hidden" config, it does not show up on command line.
+  // Enables more exact compatibility with HTK, for testing purposes.  Affects
+  // mel-energy flooring and reproduces a bug in HTK.
+  bool htk_mode = false;
+
+  std::string ToString() const {
+    std::ostringstream os;
+    os << "num_bins: " << num_bins << "\n";
+    os << "low_freq: " << low_freq << "\n";
+    os << "high_freq: " << high_freq << "\n";
+    os << "vtln_low: " << vtln_low << "\n";
+    os << "vtln_high: " << vtln_high << "\n";
+    os << "debug_mel: " << debug_mel << "\n";
+    os << "htk_mode: " << htk_mode << "\n";
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts);
+
+class MelBanks {
+ public:
+  static inline float InverseMelScale(float mel_freq) {
+    return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f);
+  }
+
+  static inline float MelScale(float freq) {
+    return 1127.0f * logf(1.0f + freq / 700.0f);
+  }
+
+  static float VtlnWarpFreq(
+      float vtln_low_cutoff,
+      float vtln_high_cutoff,  // discontinuities in warp func
+      float low_freq,
+      float high_freq,  // upper+lower frequency cutoffs in
+      // the mel computation
+      float vtln_warp_factor, float freq);
+
+  static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff,
+                               float low_freq, float high_freq,
+                               float vtln_warp_factor, float mel_freq);
+
+  // TODO(fangjun): Remove vtln_warp_factor
+  MelBanks(const MelBanksOptions &opts,
+           const FrameExtractionOptions &frame_opts, float vtln_warp_factor);
+
+  /// Compute Mel energies (note: not log energies).
+  /// At input, "fft_energies" contains the FFT energies (not log).
+  ///
+  /// @param fft_energies 1-D array of size num_fft_bins/2+1
+  /// @param mel_energies_out  1-D array of size num_mel_bins
+  void Compute(const float *fft_energies, float *mel_energies_out) const;
+
+  int32_t NumBins() const { return bins_.size(); }
+
+ private:
+  // center frequencies of bins, numbered from 0 ... num_bins-1.
+  // Needed by GetCenterFreqs().
+  std::vector<float> center_freqs_;
+
+  // the "bins_" vector is a vector, one for each bin, of a pair:
+  // (the first nonzero fft-bin), (the vector of weights).
+  std::vector<std::pair<int32_t, std::vector<float>>> bins_;
+
+  // TODO(fangjun): Remove debug_ and htk_mode_
+  bool debug_;
+  bool htk_mode_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_

+ 166 - 0
ggml/examples/kaldi-native-fbank/csrc/online-feature.cc

@@ -0,0 +1,166 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The content in this file is copied/modified from
+// This file is copied/modified from kaldi/src/feat/online-feature.cc
+
+#include "online-feature.h"
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "feature-window.h"
+#include "log.h"
+
+namespace knf {
+
+RecyclingVector::RecyclingVector(int32_t items_to_hold)
+    : items_to_hold_(items_to_hold == 0 ? -1 : items_to_hold),
+      first_available_index_(0) {}
+
+const float *RecyclingVector::At(int32_t index) const {
+  if (index < first_available_index_) {
+    KNF_LOG(FATAL) << "Attempted to retrieve feature vector that was "
+                      "already removed by the RecyclingVector (index = "
+                   << index << "; "
+                   << "first_available_index = " << first_available_index_
+                   << "; "
+                   << "size = " << Size() << ")";
+  }
+  // 'at' does size checking.
+  return items_.at(index - first_available_index_).data();
+}
+
+void RecyclingVector::PushBack(std::vector<float> item) {
+  // Note: -1 is a larger number when treated as unsigned
+  if (items_.size() == static_cast<size_t>(items_to_hold_)) {
+    items_.pop_front();
+    ++first_available_index_;
+  }
+  items_.push_back(std::move(item));
+}
+
+int32_t RecyclingVector::Size() const {
+  return first_available_index_ + static_cast<int32_t>(items_.size());
+}
+
+// discard the first n frames
+void RecyclingVector::Pop(int32_t n) {
+  for (int32_t i = 0; i < n && !items_.empty(); ++i) {
+    items_.pop_front();
+    ++first_available_index_;
+  }
+}
+
+template <class C>
+OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
+    const typename C::Options &opts)
+    : computer_(opts),
+      window_function_(computer_.GetFrameOptions()),
+      input_finished_(false),
+      waveform_offset_(0) {}
+
+template <class C>
+void OnlineGenericBaseFeature<C>::AcceptWaveform(float sampling_rate,
+                                                 const float *waveform,
+                                                 int32_t n) {
+  if (n == 0) {
+    return;  // Nothing to do.
+  }
+
+  if (input_finished_) {
+    KNF_LOG(FATAL) << "AcceptWaveform called after InputFinished() was called.";
+  }
+
+  KNF_CHECK_EQ(sampling_rate, computer_.GetFrameOptions().samp_freq);
+
+  waveform_remainder_.insert(waveform_remainder_.end(), waveform, waveform + n);
+
+  ComputeFeatures();
+}
+
+template <class C>
+void OnlineGenericBaseFeature<C>::InputFinished() {
+  input_finished_ = true;
+  ComputeFeatures();
+}
+
+template <class C>
+void OnlineGenericBaseFeature<C>::ComputeFeatures() {
+  const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions();
+
+  int64_t num_samples_total = waveform_offset_ + waveform_remainder_.size();
+
+  int32_t num_frames_old = features_.Size();
+
+  int32_t num_frames_new =
+      NumFrames(num_samples_total, frame_opts, input_finished_);
+
+  KNF_CHECK_GE(num_frames_new, num_frames_old);
+
+  // note: this online feature-extraction code does not support VTLN.
+  float vtln_warp = 1.0;
+
+  std::vector<float> window;
+  bool need_raw_log_energy = computer_.NeedRawLogEnergy();
+
+  for (int32_t frame = num_frames_old; frame < num_frames_new; ++frame) {
+    std::fill(window.begin(), window.end(), 0);
+    float raw_log_energy = 0.0;
+    ExtractWindow(waveform_offset_, waveform_remainder_.data(), waveform_remainder_.size(),
+                  frame, frame_opts, window_function_, &window,
+                  need_raw_log_energy ? &raw_log_energy : nullptr);
+
+    std::vector<float> this_feature(computer_.Dim());
+
+    computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature.data());
+    features_.PushBack(std::move(this_feature));
+  }
+
+  // OK, we will now discard any portion of the signal that will not be
+  // necessary to compute frames in the future.
+  int64_t first_sample_of_next_frame =
+      FirstSampleOfFrame(num_frames_new, frame_opts);
+
+  int32_t samples_to_discard = first_sample_of_next_frame - waveform_offset_;
+
+  if (samples_to_discard > 0) {
+    // discard the leftmost part of the waveform that we no longer need.
+    int32_t new_num_samples =
+        static_cast<int32_t>(waveform_remainder_.size()) - samples_to_discard;
+
+    if (new_num_samples <= 0) {
+      // odd, but we'll try to handle it.
+      waveform_offset_ += waveform_remainder_.size();
+      waveform_remainder_.resize(0);
+    } else {
+      std::vector<float> new_remainder(new_num_samples);
+
+      std::copy(waveform_remainder_.begin() + samples_to_discard,
+                waveform_remainder_.end(), new_remainder.begin());
+      waveform_offset_ += samples_to_discard;
+
+      waveform_remainder_.swap(new_remainder);
+    }
+  }
+}
+
+template class OnlineGenericBaseFeature<FbankComputer>;
+
+}  // namespace knf

+ 148 - 0
ggml/examples/kaldi-native-fbank/csrc/online-feature.h

@@ -0,0 +1,148 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The content in this file is copied/modified from
+// This file is copied/modified from kaldi/src/feat/online-feature.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_
+#define KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_
+
+#include <cstdint>
+#include <deque>
+#include <vector>
+
+#include "feature-fbank.h"
+
+namespace knf {
+
+/// This class serves as a storage for feature vectors with an option to limit
+/// the memory usage by removing old elements. The deleted frames indices are
+/// "remembered" so that regardless of the MAX_ITEMS setting, the user always
+/// provides the indices as if no deletion was being performed.
+/// This is useful when processing very long recordings which would otherwise
+/// cause the memory to eventually blow up when the features are not being
+/// removed.
+class RecyclingVector {
+ public:
+  /// By default it does not remove any elements.
+  explicit RecyclingVector(int32_t items_to_hold = -1);
+
+  ~RecyclingVector() = default;
+  RecyclingVector(const RecyclingVector &) = delete;
+  RecyclingVector &operator=(const RecyclingVector &) = delete;
+
+  // The pointer is owned by RecyclingVector
+  // Users should not free it
+  const float *At(int32_t index) const;
+
+  void PushBack(std::vector<float> item);
+
+  /// This method returns the size as if no "recycling" had happened,
+  /// i.e. equivalent to the number of times the PushBack method has been
+  /// called.
+  int32_t Size() const;
+
+  // discard the first n frames
+  void Pop(int32_t n);
+
+ private:
+  std::deque<std::vector<float>> items_;
+  int32_t items_to_hold_;
+  int32_t first_available_index_;
+};
+
+/// This is a templated class for online feature extraction;
+/// it's templated on a class like MfccComputer or PlpComputer
+/// that does the basic feature extraction.
+template <class C>
+class OnlineGenericBaseFeature {
+ public:
+  // Constructor from options class
+  explicit OnlineGenericBaseFeature(const typename C::Options &opts);
+
+  int32_t Dim() const { return computer_.Dim(); }
+
+  float FrameShiftInSeconds() const {
+    return computer_.GetFrameOptions().frame_shift_ms / 1000.0f;
+  }
+
+  int32_t NumFramesReady() const { return features_.Size(); }
+
+  // Note: IsLastFrame() will only ever return true if you have called
+  // InputFinished() (and this frame is the last frame).
+  bool IsLastFrame(int32_t frame) const {
+    return input_finished_ && frame == NumFramesReady() - 1;
+  }
+
+  const float *GetFrame(int32_t frame) const { return features_.At(frame); }
+
+  // This would be called from the application, when you get
+  // more wave data.  Note: the sampling_rate is only provided so
+  // the code can assert that it matches the sampling rate
+  // expected in the options.
+  //
+  // @param sampling_rate The sampling_rate of the input waveform
+  // @param waveform Pointer to a 1-D array of size n
+  // @param n Number of entries in waveform
+  void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n);
+
+  // InputFinished() tells the class you won't be providing any
+  // more waveform.  This will help flush out the last frame or two
+  // of features, in the case where snip-edges == false; it also
+  // affects the return value of IsLastFrame().
+  void InputFinished();
+
+  // discard the first n frames
+  void Pop(int32_t n) { features_.Pop(n); }
+
+ private:
+  // This function computes any additional feature frames that it is possible to
+  // compute from 'waveform_remainder_', which at this point may contain more
+  // than just a remainder-sized quantity (because AcceptWaveform() appends to
+  // waveform_remainder_ before calling this function).  It adds these feature
+  // frames to features_, and shifts off any now-unneeded samples of input from
+  // waveform_remainder_ while incrementing waveform_offset_ by the same amount.
+  void ComputeFeatures();
+
+  C computer_;  // class that does the MFCC or PLP or filterbank computation
+
+  FeatureWindowFunction window_function_;
+
+  // features_ is the Mfcc or Plp or Fbank features that we have already
+  // computed.
+
+  RecyclingVector features_;
+
+  // True if the user has called "InputFinished()"
+  bool input_finished_;
+
+  // waveform_offset_ is the number of samples of waveform that we have
+  // already discarded, i.e. that were prior to 'waveform_remainder_'.
+  int64_t waveform_offset_;
+
+  // waveform_remainder_ is a short piece of waveform that we may need to keep
+  // after extracting all the whole frames we can (whatever length of feature
+  // will be required for the next phase of computation).
+  // It is a 1-D tensor
+  std::vector<float> waveform_remainder_;
+};
+
+using OnlineFbank = OnlineGenericBaseFeature<FbankComputer>;
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_

+ 67 - 0
ggml/examples/kaldi-native-fbank/csrc/rfft.cc

@@ -0,0 +1,67 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rfft.h"
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "log.h"
+
+// see fftsg.c
+#ifdef __cplusplus
+extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);
+#else
+void rdft(int n, int isgn, double *a, int *ip, double *w);
+#endif
+
+namespace knf {
+class Rfft::RfftImpl {
+ public:
+  explicit RfftImpl(int32_t n) : n_(n), ip_(2 + std::sqrt(n / 2)), w_(n / 2) {
+    KNF_CHECK_EQ(n & (n - 1), 0);
+  }
+
+  void Compute(float *in_out) {
+    std::vector<double> d(in_out, in_out + n_);
+
+    Compute(d.data());
+
+    std::copy(d.begin(), d.end(), in_out);
+  }
+
+  void Compute(double *in_out) {
+    // 1 means forward fft
+    rdft(n_, 1, in_out, ip_.data(), w_.data());
+  }
+
+ private:
+  int32_t n_;
+  std::vector<int32_t> ip_;
+  std::vector<double> w_;
+};
+
+Rfft::Rfft(int32_t n) : impl_(std::make_unique<RfftImpl>(n)) {}
+
+Rfft::~Rfft() = default;
+
+void Rfft::Compute(float *in_out) { impl_->Compute(in_out); }
+void Rfft::Compute(double *in_out) { impl_->Compute(in_out); }
+
+}  // namespace knf

+ 56 - 0
ggml/examples/kaldi-native-fbank/csrc/rfft.h

@@ -0,0 +1,56 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_
+#define KALDI_NATIVE_FBANK_CSRC_RFFT_H_
+
+#include <memory>
+
+namespace knf {
+
+// n-point Real discrete Fourier transform
+// where n is a power of 2. n >= 2
+//
+//  R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+//  I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0<k<n/2
+class Rfft {
+ public:
+  // @param n Number of fft bins. it should be a power of 2.
+  explicit Rfft(int32_t n);
+  ~Rfft();
+
+  /** @param in_out A 1-D array of size n.
+   *             On return:
+   *               in_out[0] = R[0]
+   *               in_out[1] = R[n/2]
+   *               for 1 < k < n/2,
+   *                 in_out[2*k] = R[k]
+   *                 in_out[2*k+1] = I[k]
+   *
+   */
+  void Compute(float *in_out);
+  void Compute(double *in_out);
+
+ private:
+  class RfftImpl;
+  std::unique_ptr<RfftImpl> impl_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_RFFT_H_

+ 73 - 0
ggml/examples/kaldi-native-fbank/csrc/test-log.cc

@@ -0,0 +1,73 @@
+/**
+ * Copyright      2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+#include "log.h"
+
+namespace knf {
+
+#if KNF_ENABLE_CHECK
+
+TEST(Log, TestLog) {
+  KNF_LOG(TRACE) << "this is a trace message";
+  KNF_LOG(DEBUG) << "this is a debug message";
+  KNF_LOG(INFO) << "this is an info message";
+  KNF_LOG(WARNING) << "this is a warning message";
+  KNF_LOG(ERROR) << "this is an error message";
+
+  ASSERT_THROW(KNF_LOG(FATAL) << "This will crash the program",
+               std::runtime_error);
+
+  // For debug build
+
+  KNF_DLOG(TRACE) << "this is a trace message for debug build";
+  KNF_DLOG(DEBUG) << "this is a trace message for debug build";
+  KNF_DLOG(INFO) << "this is a trace message for debug build";
+  KNF_DLOG(ERROR) << "this is an error message for debug build";
+  KNF_DLOG(WARNING) << "this is a trace message for debug build";
+
+#if !defined(NDEBUG)
+  ASSERT_THROW(KNF_DLOG(FATAL) << "this is a trace message for debug build",
+               std::runtime_error);
+#endif
+}
+
+TEST(Log, TestCheck) {
+  KNF_CHECK_EQ(1, 1) << "ok";
+  KNF_CHECK_LE(1, 3) << "ok";
+  KNF_CHECK_LT(1, 2) << "ok";
+  KNF_CHECK_GT(2, 1) << "ok";
+  KNF_CHECK_GE(2, 1) << "ok";
+
+  ASSERT_THROW(KNF_CHECK_EQ(2, 1) << "bad things happened", std::runtime_error);
+
+  // for debug build
+  KNF_DCHECK_EQ(1, 1) << "ok";
+  KNF_DCHECK_LE(1, 3) << "ok";
+  KNF_DCHECK_LT(1, 2) << "ok";
+  KNF_DCHECK_GT(2, 1) << "ok";
+  KNF_DCHECK_GE(2, 1) << "ok";
+
+#if !defined(NDEBUG)
+  ASSERT_THROW(KNF_CHECK_EQ(2, 1) << "bad things happened", std::runtime_error);
+#endif
+}
+
+#endif
+
+}  // namespace knf

+ 48 - 0
ggml/examples/kaldi-native-fbank/csrc/test-online-fbank.cc

@@ -0,0 +1,48 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+
+#include "online-feature.h"
+
+int main() {
+  knf::FbankOptions opts;
+  opts.frame_opts.dither = 0;
+  opts.mel_opts.num_bins = 10;
+
+  knf::OnlineFbank fbank(opts);
+  for (int32_t i = 0; i < 1600; ++i) {
+    float s = (i * i - i / 2) / 32767.;
+    fbank.AcceptWaveform(16000, &s, 1);
+  }
+
+  std::ostringstream os;
+
+  int32_t n = fbank.NumFramesReady();
+  for (int32_t i = 0; i != n; ++i) {
+    const float *frame = fbank.GetFrame(i);
+    for (int32_t k = 0; k != opts.mel_opts.num_bins; ++k) {
+      os << frame[k] << ", ";
+    }
+    os << "\n";
+  }
+
+  std::cout << os.str() << "\n";
+
+  return 0;
+}

+ 59 - 0
ggml/examples/kaldi-native-fbank/csrc/test-online-feature.cc

@@ -0,0 +1,59 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+#include "online-feature.h"
+namespace knf {
+
+// TEST(RecyclingVector, TestUnlimited) {
+//   RecyclingVector v(-1);
+//   constexpr int32_t N = 100;
+//   for (int32_t i = 0; i != N; ++i) {
+//     std::unique_ptr<float[]> p(new float[3]{i, i + 1, i + 2});
+//     v.PushBack(std::move(p));
+//   }
+//   ASSERT_EQ(v.Size(), N);
+
+//   for (int32_t i = 0; i != N; ++i) {
+//     const float *t = v.At(i);
+//     for (int32_t k = 0; k != 3; ++k) {
+//       EXPECT_EQ(t[k], (i + k));
+//     }
+//   }
+// }
+
+// TEST(RecyclingVector, Testlimited) {
+//   constexpr int32_t K = 3;
+//   constexpr int32_t N = 10;
+//   RecyclingVector v(K);
+//   for (int32_t i = 0; i != N; ++i) {
+//     std::unique_ptr<float[]> p(new float[3]{i, i + 1, i + 2});
+//     v.PushBack(std::move(p));
+//   }
+
+//   ASSERT_EQ(v.Size(), N);
+
+//   for (int32_t i = N - K; i != N; ++i) {
+//     const float *t = v.At(i);
+
+//     for (int32_t k = 0; k != 3; ++k) {
+//       EXPECT_EQ(t[k], (i + k));
+//     }
+//   }
+// }
+}  // namespace knf

+ 52 - 0
ggml/examples/kaldi-native-fbank/csrc/test-rfft.cc

@@ -0,0 +1,52 @@
+/**
+ * Copyright      2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+#include "rfft.h"
+
+namespace knf {
+
+#if 0
+>>> import torch
+>>> a = torch.tensor([1., -1, 3, 8, 20, 6, 0, 2])
+>>> torch.fft.rfft(a)
+tensor([ 39.0000+0.0000j, -28.1924-2.2929j,  18.0000+5.0000j,  -9.8076+3.7071j,
+          9.0000+0.0000j])
+#endif
+
+TEST(Rfft, TestRfft) {
+  knf::Rfft fft(8);
+  for (int32_t i = 0; i != 10; ++i) {
+    std::vector<float> d = {1, -1, 3, 8, 20, 6, 0, 2};
+    fft.Compute(d.data());
+
+    EXPECT_EQ(d[0], 39);
+    EXPECT_EQ(d[1], 9);
+
+    EXPECT_NEAR(d[2], -28.1924, 1e-3);
+    EXPECT_NEAR(-d[3], -2.2929, 1e-3);
+
+    EXPECT_NEAR(d[4], 18, 1e-3);
+    EXPECT_NEAR(-d[5], 5, 1e-3);
+
+    EXPECT_NEAR(d[6], -9.8076, 1e-3);
+    EXPECT_NEAR(-d[7], 3.7071, 1e-3);
+  }
+}
+
+}  // namespace knf

+ 2 - 0
ggml/examples/kaldi-native-fbank/python/CMakeLists.txt

@@ -0,0 +1,2 @@
+add_subdirectory(csrc)
+add_subdirectory(tests)

+ 28 - 0
ggml/examples/kaldi-native-fbank/python/csrc/CMakeLists.txt

@@ -0,0 +1,28 @@
+pybind11_add_module(_kaldi_native_fbank
+  feature-fbank.cc
+  feature-window.cc
+  kaldi-native-fbank.cc
+  mel-computations.cc
+  online-feature.cc
+  utils.cc
+)
+
+if(APPLE)
+  execute_process(
+    COMMAND "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE PYTHON_SITE_PACKAGE_DIR
+  )
+  message(STATUS "PYTHON_SITE_PACKAGE_DIR: ${PYTHON_SITE_PACKAGE_DIR}")
+  target_link_libraries(_kaldi_native_fbank PRIVATE "-Wl,-rpath,${PYTHON_SITE_PACKAGE_DIR}")
+endif()
+
+if(NOT WIN32)
+  target_link_libraries(_kaldi_native_fbank PRIVATE "-Wl,-rpath,${kaldi_native_fbank_rpath_origin}/kaldi_native_fbank/lib")
+endif()
+
+target_link_libraries(_kaldi_native_fbank PRIVATE kaldi-native-fbank-core)
+
+install(TARGETS _kaldi_native_fbank
+  DESTINATION ../
+)

+ 57 - 0
ggml/examples/kaldi-native-fbank/python/csrc/feature-fbank.cc

@@ -0,0 +1,57 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kaldi-native-fbank/python/csrc/feature-fbank.h"
+
+#include <memory>
+#include <string>
+
+#include "feature-fbank.h"
+#include "kaldi-native-fbank/python/csrc/utils.h"
+
+namespace knf {
+
+static void PybindFbankOptions(py::module &m) {  // NOLINT
+  using PyClass = FbankOptions;
+  py::class_<PyClass>(m, "FbankOptions")
+      .def(py::init<>())
+      .def_readwrite("frame_opts", &PyClass::frame_opts)
+      .def_readwrite("mel_opts", &PyClass::mel_opts)
+      .def_readwrite("use_energy", &PyClass::use_energy)
+      .def_readwrite("energy_floor", &PyClass::energy_floor)
+      .def_readwrite("raw_energy", &PyClass::raw_energy)
+      .def_readwrite("htk_compat", &PyClass::htk_compat)
+      .def_readwrite("use_log_fbank", &PyClass::use_log_fbank)
+      .def_readwrite("use_power", &PyClass::use_power)
+      .def("__str__",
+           [](const PyClass &self) -> std::string { return self.ToString(); })
+      .def("as_dict",
+           [](const PyClass &self) -> py::dict { return AsDict(self); })
+      .def_static(
+          "from_dict",
+          [](py::dict dict) -> PyClass { return FbankOptionsFromDict(dict); })
+      .def(py::pickle(
+          [](const PyClass &self) -> py::dict { return AsDict(self); },
+          [](py::dict dict) -> PyClass { return FbankOptionsFromDict(dict); }));
+}
+
+void PybindFeatureFbank(py::module &m) {  // NOLINT
+  PybindFbankOptions(m);
+}
+
+}  // namespace knf

+ 30 - 0
ggml/examples/kaldi-native-fbank/python/csrc/feature-fbank.h

@@ -0,0 +1,30 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_FBANK_H_
+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_FBANK_H_
+
+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
+
+namespace knf {
+
+void PybindFeatureFbank(py::module &m);  // NOLINT
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_FBANK_H_

+ 66 - 0
ggml/examples/kaldi-native-fbank/python/csrc/feature-window.cc

@@ -0,0 +1,66 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kaldi-native-fbank/python/csrc/feature-window.h"
+
+#include <string>
+
+#include "feature-window.h"
+#include "kaldi-native-fbank/python/csrc/utils.h"
+
+namespace knf {
+
+static void PybindFrameExtractionOptions(py::module &m) {  // NOLINT
+  using PyClass = FrameExtractionOptions;
+  py::class_<PyClass>(m, "FrameExtractionOptions")
+      .def(py::init<>())
+      .def_readwrite("samp_freq", &PyClass::samp_freq)
+      .def_readwrite("frame_shift_ms", &PyClass::frame_shift_ms)
+      .def_readwrite("frame_length_ms", &PyClass::frame_length_ms)
+      .def_readwrite("dither", &PyClass::dither)
+      .def_readwrite("preemph_coeff", &PyClass::preemph_coeff)
+      .def_readwrite("remove_dc_offset", &PyClass::remove_dc_offset)
+      .def_readwrite("window_type", &PyClass::window_type)
+      .def_readwrite("round_to_power_of_two", &PyClass::round_to_power_of_two)
+      .def_readwrite("blackman_coeff", &PyClass::blackman_coeff)
+      .def_readwrite("snip_edges", &PyClass::snip_edges)
+      .def("as_dict",
+           [](const PyClass &self) -> py::dict { return AsDict(self); })
+      .def_static("from_dict",
+                  [](py::dict dict) -> PyClass {
+                    return FrameExtractionOptionsFromDict(dict);
+                  })
+#if 0
+      .def_readwrite("allow_downsample",
+                     &PyClass::allow_downsample)
+      .def_readwrite("allow_upsample", &PyClass::allow_upsample)
+#endif
+      .def("__str__",
+           [](const PyClass &self) -> std::string { return self.ToString(); })
+      .def(py::pickle(
+          [](const PyClass &self) -> py::dict { return AsDict(self); },
+          [](py::dict dict) -> PyClass {
+            return FrameExtractionOptionsFromDict(dict);
+          }));
+}
+
+void PybindFeatureWindow(py::module &m) {  // NOLINT
+  PybindFrameExtractionOptions(m);
+}
+
+}  // namespace knf

+ 30 - 0
ggml/examples/kaldi-native-fbank/python/csrc/feature-window.h

@@ -0,0 +1,30 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_WINDOW_H_
+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_WINDOW_H_
+
+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
+
+namespace knf {
+
+void PybindFeatureWindow(py::module &m);  // NOLINT
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_WINDOW_H_

+ 37 - 0
ggml/examples/kaldi-native-fbank/python/csrc/kaldi-native-fbank.cc

@@ -0,0 +1,37 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
+
+#include "kaldi-native-fbank/python/csrc/feature-fbank.h"
+#include "kaldi-native-fbank/python/csrc/feature-window.h"
+#include "kaldi-native-fbank/python/csrc/mel-computations.h"
+#include "kaldi-native-fbank/python/csrc/online-feature.h"
+
+namespace knf {
+
+PYBIND11_MODULE(_kaldi_native_fbank, m) {
+  m.doc() = "Python wrapper for kaldi native fbank";
+  PybindFeatureWindow(m);
+  PybindMelComputations(m);
+  PybindFeatureFbank(m);
+
+  PybindOnlineFeature(m);
+}
+
+}  // namespace knf

+ 27 - 0
ggml/examples/kaldi-native-fbank/python/csrc/kaldi-native-fbank.h

@@ -0,0 +1,27 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_KALDI_NATIVE_FBANK_H_
+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_KALDI_NATIVE_FBANK_H_
+
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+namespace py = pybind11;
+
+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_KALDI_NATIVE_FBANK_H_

+ 58 - 0
ggml/examples/kaldi-native-fbank/python/csrc/mel-computations.cc

@@ -0,0 +1,58 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kaldi-native-fbank/python/csrc/mel-computations.h"
+
+#include <string>
+
+#include "mel-computations.h"
+#include "kaldi-native-fbank/python/csrc/utils.h"
+
+namespace knf {
+
+static void PybindMelBanksOptions(py::module &m) {  // NOLINT
+  using PyClass = MelBanksOptions;
+  py::class_<PyClass>(m, "MelBanksOptions")
+      .def(py::init<>())
+      .def_readwrite("num_bins", &PyClass::num_bins)
+      .def_readwrite("low_freq", &PyClass::low_freq)
+      .def_readwrite("high_freq", &PyClass::high_freq)
+      .def_readwrite("vtln_low", &PyClass::vtln_low)
+      .def_readwrite("vtln_high", &PyClass::vtln_high)
+      .def_readwrite("debug_mel", &PyClass::debug_mel)
+      .def_readwrite("htk_mode", &PyClass::htk_mode)
+      .def("__str__",
+           [](const PyClass &self) -> std::string { return self.ToString(); })
+      .def("as_dict",
+           [](const PyClass &self) -> py::dict { return AsDict(self); })
+      .def_static("from_dict",
+                  [](py::dict dict) -> PyClass {
+                    return MelBanksOptionsFromDict(dict);
+                  })
+      .def(py::pickle(
+          [](const PyClass &self) -> py::dict { return AsDict(self); },
+          [](py::dict dict) -> PyClass {
+            return MelBanksOptionsFromDict(dict);
+          }));
+}
+
+void PybindMelComputations(py::module &m) {  // NOLINT
+  PybindMelBanksOptions(m);
+}
+
+}  // namespace knf

+ 30 - 0
ggml/examples/kaldi-native-fbank/python/csrc/mel-computations.h

@@ -0,0 +1,30 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_MEL_COMPUTATIONS_H_
+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_MEL_COMPUTATIONS_H_
+
+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
+
+namespace knf {
+
+void PybindMelComputations(py::module &m);  // NOLINT
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_MEL_COMPUTATIONS_H_

+ 68 - 0
ggml/examples/kaldi-native-fbank/python/csrc/online-feature.cc

@@ -0,0 +1,68 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kaldi-native-fbank/python/csrc/online-feature.h"
+
+#include <string>
+#include <vector>
+
+#include "online-feature.h"
+namespace knf {
+
+template <typename C>
+void PybindOnlineFeatureTpl(py::module &m,  // NOLINT
+                            const std::string &class_name,
+                            const std::string &class_help_doc = "") {
+  using PyClass = OnlineGenericBaseFeature<C>;
+  using Options = typename C::Options;
+  py::class_<PyClass>(m, class_name.c_str(), class_help_doc.c_str())
+      .def(py::init<const Options &>(), py::arg("opts"))
+      .def_property_readonly("dim", &PyClass::Dim)
+      .def_property_readonly("frame_shift_in_seconds",
+                             &PyClass::FrameShiftInSeconds)
+      .def_property_readonly("num_frames_ready", &PyClass::NumFramesReady)
+      .def("is_last_frame", &PyClass::IsLastFrame, py::arg("frame"))
+      .def(
+          "get_frame",
+          [](py::object obj, int32_t frame) {
+            auto *self = obj.cast<PyClass *>();
+            const float *f = self->GetFrame(frame);
+            return py::array_t<float>({self->Dim()},    // shape
+                                      {sizeof(float)},  // stride in bytes
+                                      f,                // ptr
+                                      obj);  // it will increase the reference
+                                             // count of **this** vector
+          },
+          py::arg("frame"))
+      .def(
+          "accept_waveform",
+          [](PyClass &self, float sampling_rate,
+             const std::vector<float> &waveform) {
+            self.AcceptWaveform(sampling_rate, waveform.data(),
+                                waveform.size());
+          },
+          py::arg("sampling_rate"), py::arg("waveform"),
+          py::call_guard<py::gil_scoped_release>())
+      .def("input_finished", &PyClass::InputFinished);
+}
+
+void PybindOnlineFeature(py::module &m) {  // NOLINT
+  PybindOnlineFeatureTpl<FbankComputer>(m, "OnlineFbank");
+}
+
+}  // namespace knf

+ 30 - 0
ggml/examples/kaldi-native-fbank/python/csrc/online-feature.h

@@ -0,0 +1,30 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_ONLINE_FEATURE_H_
+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_ONLINE_FEATURE_H_
+
+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
+
+namespace knf {
+
+void PybindOnlineFeature(py::module &m);  // NOLINT
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_ONLINE_FEATURE_H_

+ 134 - 0
ggml/examples/kaldi-native-fbank/python/csrc/utils.cc

@@ -0,0 +1,134 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kaldi-native-fbank/python/csrc/utils.h"
+
+#include <string>
+
+#include "feature-window.h"
+
+#define FROM_DICT(type, key)         \
+  if (dict.contains(#key)) {         \
+    opts.key = py::type(dict[#key]); \
+  }
+
+#define AS_DICT(key) dict[#key] = opts.key
+
+namespace knf {
+
+FrameExtractionOptions FrameExtractionOptionsFromDict(py::dict dict) {
+  FrameExtractionOptions opts;
+
+  FROM_DICT(float_, samp_freq);
+  FROM_DICT(float_, frame_shift_ms);
+  FROM_DICT(float_, frame_length_ms);
+  FROM_DICT(float_, dither);
+  FROM_DICT(float_, preemph_coeff);
+  FROM_DICT(bool_, remove_dc_offset);
+  FROM_DICT(str, window_type);
+  FROM_DICT(bool_, round_to_power_of_two);
+  FROM_DICT(float_, blackman_coeff);
+  FROM_DICT(bool_, snip_edges);
+
+  return opts;
+}
+
+py::dict AsDict(const FrameExtractionOptions &opts) {
+  py::dict dict;
+
+  AS_DICT(samp_freq);
+  AS_DICT(frame_shift_ms);
+  AS_DICT(frame_length_ms);
+  AS_DICT(dither);
+  AS_DICT(preemph_coeff);
+  AS_DICT(remove_dc_offset);
+  AS_DICT(window_type);
+  AS_DICT(round_to_power_of_two);
+  AS_DICT(blackman_coeff);
+  AS_DICT(snip_edges);
+
+  return dict;
+}
+
+MelBanksOptions MelBanksOptionsFromDict(py::dict dict) {
+  MelBanksOptions opts;
+
+  FROM_DICT(int_, num_bins);
+  FROM_DICT(float_, low_freq);
+  FROM_DICT(float_, high_freq);
+  FROM_DICT(float_, vtln_low);
+  FROM_DICT(float_, vtln_high);
+  FROM_DICT(bool_, debug_mel);
+  FROM_DICT(bool_, htk_mode);
+
+  return opts;
+}
+py::dict AsDict(const MelBanksOptions &opts) {
+  py::dict dict;
+
+  AS_DICT(num_bins);
+  AS_DICT(low_freq);
+  AS_DICT(high_freq);
+  AS_DICT(vtln_low);
+  AS_DICT(vtln_high);
+  AS_DICT(debug_mel);
+  AS_DICT(htk_mode);
+
+  return dict;
+}
+
+FbankOptions FbankOptionsFromDict(py::dict dict) {
+  FbankOptions opts;
+
+  if (dict.contains("frame_opts")) {
+    opts.frame_opts = FrameExtractionOptionsFromDict(dict["frame_opts"]);
+  }
+
+  if (dict.contains("mel_opts")) {
+    opts.mel_opts = MelBanksOptionsFromDict(dict["mel_opts"]);
+  }
+
+  FROM_DICT(bool_, use_energy);
+  FROM_DICT(float_, energy_floor);
+  FROM_DICT(bool_, raw_energy);
+  FROM_DICT(bool_, htk_compat);
+  FROM_DICT(bool_, use_log_fbank);
+  FROM_DICT(bool_, use_power);
+
+  return opts;
+}
+
+py::dict AsDict(const FbankOptions &opts) {
+  py::dict dict;
+
+  dict["frame_opts"] = AsDict(opts.frame_opts);
+  dict["mel_opts"] = AsDict(opts.mel_opts);
+  AS_DICT(use_energy);
+  AS_DICT(energy_floor);
+  AS_DICT(raw_energy);
+  AS_DICT(htk_compat);
+  AS_DICT(use_log_fbank);
+  AS_DICT(use_power);
+
+  return dict;
+}
+
+#undef FROM_DICT
+#undef AS_DICT
+
+}  // namespace knf

+ 52 - 0
ggml/examples/kaldi-native-fbank/python/csrc/utils.h

@@ -0,0 +1,52 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_UTILS_H_
+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_UTILS_H_
+
+#include "feature-fbank.h"
+#include "feature-window.h"
+#include "mel-computations.h"
+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
+
+/*
+ * This file contains code about `from_dict` and
+ * `as_dict` for various options in kaldi-native-fbank.
+ *
+ * Regarding `from_dict`, users don't need to provide
+ * all the fields in the options. If some fields
+ * are not provided, it just uses the default one.
+ *
+ * If the provided dict in `from_dict` is empty,
+ * all fields use their default values.
+ */
+
+namespace knf {
+
+FrameExtractionOptions FrameExtractionOptionsFromDict(py::dict dict);
+py::dict AsDict(const FrameExtractionOptions &opts);
+
+MelBanksOptions MelBanksOptionsFromDict(py::dict dict);
+py::dict AsDict(const MelBanksOptions &opts);
+
+FbankOptions FbankOptionsFromDict(py::dict dict);
+py::dict AsDict(const FbankOptions &opts);
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_UTILS_H_

+ 6 - 0
ggml/examples/kaldi-native-fbank/python/kaldi_native_fbank/__init__.py

@@ -0,0 +1,6 @@
+from _kaldi_native_fbank import (
+    FrameExtractionOptions,
+    MelBanksOptions,
+    OnlineFbank,
+    FbankOptions,
+)

+ 31 - 0
ggml/examples/kaldi-native-fbank/python/tests/CMakeLists.txt

@@ -0,0 +1,31 @@
+function(kaldi_native_fbank_add_py_test source)
+  get_filename_component(name ${source} NAME_WE)
+  set(name "${name}_py")
+
+    message(STATUS "source: ${source}")
+
+  add_test(NAME ${name}
+    COMMAND
+      "${PYTHON_EXECUTABLE}"
+      "${CMAKE_CURRENT_SOURCE_DIR}/${source}"
+  )
+
+  get_filename_component(kaldi_native_fbank_path ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
+
+  set_property(TEST ${name}
+    PROPERTY ENVIRONMENT "PYTHONPATH=${kaldi_native_fbank_path}:$<TARGET_FILE_DIR:_kaldi_native_fbank>:$ENV{PYTHONPATH}"
+  )
+endfunction()
+
+# please sort the files in alphabetic order
+set(py_test_files
+  test_frame_extraction_options.py
+  test_mel_bank_options.py
+  test_fbank_options.py
+)
+
+if(KALDI_NATIVE_FBANK_BUILD_TESTS)
+  foreach(source IN LISTS py_test_files)
+    kaldi_native_fbank_add_py_test(${source})
+  endforeach()
+endif()

+ 198 - 0
ggml/examples/kaldi-native-fbank/python/tests/test_fbank_options.py

@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
+
+
+import pickle
+
+import kaldi_native_fbank as knf
+
+
+def test_default():
+    opts = knf.FbankOptions()
+    assert opts.frame_opts.samp_freq == 16000
+    assert opts.frame_opts.frame_shift_ms == 10.0
+    assert opts.frame_opts.frame_length_ms == 25.0
+    assert opts.frame_opts.dither == 1.0
+    assert abs(opts.frame_opts.preemph_coeff - 0.97) < 1e-6
+    assert opts.frame_opts.remove_dc_offset is True
+    assert opts.frame_opts.window_type == "povey"
+    assert opts.frame_opts.round_to_power_of_two is True
+    assert abs(opts.frame_opts.blackman_coeff - 0.42) < 1e-6
+    assert opts.frame_opts.snip_edges is True
+
+    assert opts.mel_opts.num_bins == 23
+    assert opts.mel_opts.low_freq == 20
+    assert opts.mel_opts.high_freq == 0
+    assert opts.mel_opts.vtln_low == 100
+    assert opts.mel_opts.vtln_high == -500
+    assert opts.mel_opts.debug_mel is False
+    assert opts.mel_opts.htk_mode is False
+
+    assert opts.use_energy is False
+    assert opts.energy_floor == 0.0
+    assert opts.raw_energy is True
+    assert opts.htk_compat is False
+    assert opts.use_log_fbank is True
+    assert opts.use_power is True
+
+
+def test_set_get():
+    opts = knf.FbankOptions()
+    opts.use_energy = True
+    assert opts.use_energy is True
+
+    opts.energy_floor = 1
+    assert opts.energy_floor == 1
+
+    opts.raw_energy = False
+    assert opts.raw_energy is False
+
+    opts.htk_compat = True
+    assert opts.htk_compat is True
+
+    opts.use_log_fbank = False
+    assert opts.use_log_fbank is False
+
+    opts.use_power = False
+    assert opts.use_power is False
+
+
+def test_set_get_frame_opts():
+    opts = knf.FbankOptions()
+
+    opts.frame_opts.samp_freq = 44100
+    assert opts.frame_opts.samp_freq == 44100
+
+    opts.frame_opts.frame_shift_ms = 20.5
+    assert opts.frame_opts.frame_shift_ms == 20.5
+
+    opts.frame_opts.frame_length_ms = 1
+    assert opts.frame_opts.frame_length_ms == 1
+
+    opts.frame_opts.dither = 0.5
+    assert opts.frame_opts.dither == 0.5
+
+    opts.frame_opts.preemph_coeff = 0.25
+    assert opts.frame_opts.preemph_coeff == 0.25
+
+    opts.frame_opts.remove_dc_offset = False
+    assert opts.frame_opts.remove_dc_offset is False
+
+    opts.frame_opts.window_type = "hanning"
+    assert opts.frame_opts.window_type == "hanning"
+
+    opts.frame_opts.round_to_power_of_two = False
+    assert opts.frame_opts.round_to_power_of_two is False
+
+    opts.frame_opts.blackman_coeff = 0.25
+    assert opts.frame_opts.blackman_coeff == 0.25
+
+    opts.frame_opts.snip_edges = False
+    assert opts.frame_opts.snip_edges is False
+
+
+def test_set_get_mel_opts():
+    opts = knf.FbankOptions()
+
+    opts.mel_opts.num_bins = 100
+    assert opts.mel_opts.num_bins == 100
+
+    opts.mel_opts.low_freq = 22
+    assert opts.mel_opts.low_freq == 22
+
+    opts.mel_opts.high_freq = 1
+    assert opts.mel_opts.high_freq == 1
+
+    opts.mel_opts.vtln_low = 101
+    assert opts.mel_opts.vtln_low == 101
+
+    opts.mel_opts.vtln_high = -100
+    assert opts.mel_opts.vtln_high == -100
+
+    opts.mel_opts.debug_mel = True
+    assert opts.mel_opts.debug_mel is True
+
+    opts.mel_opts.htk_mode = True
+    assert opts.mel_opts.htk_mode is True
+
+
+def test_from_empty_dict():
+    opts = knf.FbankOptions.from_dict({})
+    opts2 = knf.FbankOptions()
+
+    assert str(opts) == str(opts2)
+
+
+def test_from_dict_partial():
+    d = {
+        "energy_floor": 10.5,
+        "htk_compat": True,
+        "mel_opts": {"num_bins": 80, "vtln_low": 1},
+        "frame_opts": {"window_type": "hanning"},
+    }
+    opts = knf.FbankOptions.from_dict(d)
+    assert opts.energy_floor == 10.5
+    assert opts.htk_compat is True
+    assert opts.mel_opts.num_bins == 80
+    assert opts.mel_opts.vtln_low == 1
+    assert opts.frame_opts.window_type == "hanning"
+
+    mel_opts = knf.MelBanksOptions.from_dict(d["mel_opts"])
+    assert str(opts.mel_opts) == str(mel_opts)
+
+
+def test_from_dict_full_and_as_dict():
+    opts = knf.FbankOptions()
+    opts.htk_compat = True
+    opts.mel_opts.num_bins = 80
+    opts.frame_opts.samp_freq = 10
+
+    d = opts.as_dict()
+    assert d["htk_compat"] is True
+    assert d["mel_opts"]["num_bins"] == 80
+    assert d["frame_opts"]["samp_freq"] == 10
+
+    mel_opts = knf.MelBanksOptions()
+    mel_opts.num_bins = 80
+    assert d["mel_opts"] == mel_opts.as_dict()
+
+    frame_opts = knf.FrameExtractionOptions()
+    frame_opts.samp_freq = 10
+    assert d["frame_opts"] == frame_opts.as_dict()
+
+    opts2 = knf.FbankOptions.from_dict(d)
+    assert str(opts2) == str(opts)
+
+    d["htk_compat"] = False
+    opts3 = knf.FbankOptions.from_dict(d)
+    assert opts3.htk_compat is False
+
+
+def test_pickle():
+    opts = knf.FbankOptions()
+    opts.use_energy = True
+    opts.use_power = False
+
+    opts.frame_opts.samp_freq = 44100
+    opts.mel_opts.num_bins = 100
+
+    data = pickle.dumps(opts)
+
+    opts2 = pickle.loads(data)
+    assert str(opts) == str(opts2)
+
+
+def main():
+    test_default()
+    test_set_get()
+    test_set_get_frame_opts()
+    test_set_get_mel_opts()
+    test_from_empty_dict()
+    test_from_dict_partial()
+    test_from_dict_full_and_as_dict()
+    test_pickle()
+
+
+if __name__ == "__main__":
+    main()

+ 119 - 0
ggml/examples/kaldi-native-fbank/python/tests/test_frame_extraction_options.py

@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+
+import pickle
+
+import kaldi_native_fbank as knf
+
+
+def test_default():
+    opts = knf.FrameExtractionOptions()
+    assert opts.samp_freq == 16000
+    assert opts.frame_shift_ms == 10.0
+    assert opts.frame_length_ms == 25.0
+    assert opts.dither == 1.0
+    assert abs(opts.preemph_coeff - 0.97) < 1e-6
+    assert opts.remove_dc_offset is True
+    assert opts.window_type == "povey"
+    assert opts.round_to_power_of_two is True
+    assert abs(opts.blackman_coeff - 0.42) < 1e-6
+    assert opts.snip_edges is True
+
+
+def test_set_get():
+    opts = knf.FrameExtractionOptions()
+    opts.samp_freq = 44100
+    assert opts.samp_freq == 44100
+
+    opts.frame_shift_ms = 20.5
+    assert opts.frame_shift_ms == 20.5
+
+    opts.frame_length_ms = 1
+    assert opts.frame_length_ms == 1
+
+    opts.dither = 0.5
+    assert opts.dither == 0.5
+
+    opts.preemph_coeff = 0.25
+    assert opts.preemph_coeff == 0.25
+
+    opts.remove_dc_offset = False
+    assert opts.remove_dc_offset is False
+
+    opts.window_type = "hanning"
+    assert opts.window_type == "hanning"
+
+    opts.round_to_power_of_two = False
+    assert opts.round_to_power_of_two is False
+
+    opts.blackman_coeff = 0.25
+    assert opts.blackman_coeff == 0.25
+
+    opts.snip_edges = False
+    assert opts.snip_edges is False
+
+
+def test_from_empty_dict():
+    opts = knf.FrameExtractionOptions.from_dict({})
+    opts2 = knf.FrameExtractionOptions()
+
+    assert str(opts) == str(opts2)
+
+
+def test_from_dict_partial():
+    d = {"samp_freq": 10, "frame_shift_ms": 2}
+
+    opts = knf.FrameExtractionOptions.from_dict(d)
+
+    opts2 = knf.FrameExtractionOptions()
+    assert str(opts) != str(opts2)
+
+    opts2.samp_freq = 10
+    assert str(opts) != str(opts2)
+
+    opts2.frame_shift_ms = 2
+    assert str(opts) == str(opts2)
+
+    opts2.frame_shift_ms = 3
+    assert str(opts) != str(opts2)
+
+
+def test_from_dict_full_and_as_dict():
+    opts = knf.FrameExtractionOptions()
+    opts.samp_freq = 20
+    opts.frame_length_ms = 100
+
+    d = opts.as_dict()
+    for key, value in d.items():
+        assert value == getattr(opts, key)
+
+    opts2 = knf.FrameExtractionOptions.from_dict(d)
+    assert str(opts2) == str(opts)
+
+    d["window_type"] = "hanning"
+    opts3 = knf.FrameExtractionOptions.from_dict(d)
+    assert opts3.window_type == "hanning"
+
+
+def test_pickle():
+    opts = knf.FrameExtractionOptions()
+    opts.samp_freq = 44100
+    opts.dither = 5.5
+    data = pickle.dumps(opts)
+
+    opts2 = pickle.loads(data)
+    assert str(opts) == str(opts2)
+
+
+def main():
+    test_default()
+    test_set_get()
+    test_from_empty_dict()
+    test_from_dict_partial()
+    test_from_dict_full_and_as_dict()
+    test_pickle()
+
+
+if __name__ == "__main__":
+    main()

+ 107 - 0
ggml/examples/kaldi-native-fbank/python/tests/test_mel_bank_options.py

@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
+
+import pickle
+
+import kaldi_native_fbank as knf
+
+
+def test_default():
+    opts = knf.MelBanksOptions()
+    assert opts.num_bins == 25
+    assert opts.low_freq == 20
+    assert opts.high_freq == 0
+    assert opts.vtln_low == 100
+    assert opts.vtln_high == -500
+    assert opts.debug_mel is False
+    assert opts.htk_mode is False
+
+
+def test_set_get():
+    opts = knf.MelBanksOptions()
+    opts.num_bins = 100
+    assert opts.num_bins == 100
+
+    opts.low_freq = 22
+    assert opts.low_freq == 22
+
+    opts.high_freq = 1
+    assert opts.high_freq == 1
+
+    opts.vtln_low = 101
+    assert opts.vtln_low == 101
+
+    opts.vtln_high = -100
+    assert opts.vtln_high == -100
+
+    opts.debug_mel = True
+    assert opts.debug_mel is True
+
+    opts.htk_mode = True
+    assert opts.htk_mode is True
+
+
+def test_from_empty_dict():
+    opts = knf.MelBanksOptions.from_dict({})
+    opts2 = knf.MelBanksOptions()
+
+    assert str(opts) == str(opts2)
+
+
+def test_from_dict_partial():
+    d = {"num_bins": 10, "debug_mel": True}
+
+    opts = knf.MelBanksOptions.from_dict(d)
+
+    opts2 = knf.MelBanksOptions()
+    assert str(opts) != str(opts2)
+
+    opts2.num_bins = 10
+    assert str(opts) != str(opts2)
+
+    opts2.debug_mel = True
+    assert str(opts) == str(opts2)
+
+    opts2.debug_mel = False
+    assert str(opts) != str(opts2)
+
+
+def test_from_dict_full_and_as_dict():
+    opts = knf.MelBanksOptions()
+    opts.num_bins = 80
+    opts.vtln_high = 2
+
+    d = opts.as_dict()
+    for key, value in d.items():
+        assert value == getattr(opts, key)
+
+    opts2 = knf.MelBanksOptions.from_dict(d)
+    assert str(opts2) == str(opts)
+
+    d["htk_mode"] = True
+    opts3 = knf.MelBanksOptions.from_dict(d)
+    assert opts3.htk_mode is True
+
+
+def test_pickle():
+    opts = knf.MelBanksOptions()
+    opts.num_bins = 100
+    opts.low_freq = 22
+    data = pickle.dumps(opts)
+
+    opts2 = pickle.loads(data)
+    assert str(opts) == str(opts2)
+
+
+def main():
+    test_default()
+    test_set_get()
+    test_from_empty_dict()
+    test_from_dict_partial()
+    test_from_dict_full_and_as_dict()
+    test_pickle()
+
+
+if __name__ == "__main__":
+    main()

+ 48 - 0
ggml/examples/kaldi-native-fbank/python/tests/test_online_fbank.py

@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+import sys
+
+try:
+    import kaldifeat
+except:
+    print("Please install kaldifeat first")
+    sys.exit(0)
+
+import kaldi_native_fbank as knf
+import torch
+
+
+def main():
+    sampling_rate = 16000
+    samples = torch.randn(16000 * 10)
+
+    opts = kaldifeat.FbankOptions()
+    opts.frame_opts.dither = 0
+    opts.mel_opts.num_bins = 80
+    opts.frame_opts.snip_edges = False
+    opts.mel_opts.debug_mel = False
+
+    online_fbank = kaldifeat.OnlineFbank(opts)
+
+    online_fbank.accept_waveform(sampling_rate, samples)
+
+    opts = knf.FbankOptions()
+    opts.frame_opts.dither = 0
+    opts.mel_opts.num_bins = 80
+    opts.frame_opts.snip_edges = False
+    opts.mel_opts.debug_mel = False
+
+    fbank = knf.OnlineFbank(opts)
+    fbank.accept_waveform(sampling_rate, samples.tolist())
+
+    assert online_fbank.num_frames_ready == fbank.num_frames_ready
+    for i in range(fbank.num_frames_ready):
+        f1 = online_fbank.get_frame(i)
+        f2 = torch.from_numpy(fbank.get_frame(i))
+        assert torch.allclose(f1, f2, atol=1e-3), (i, (f1 - f2).abs().max())
+
+
+if __name__ == "__main__":
+    torch.manual_seed(20220825)
+    main()
+    print("success")

+ 1 - 1
ggml/examples/unity/CMakeLists.txt

@@ -2,7 +2,7 @@
 
 add_executable(unity unity.cpp)
 target_include_directories(unity PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
-target_link_libraries(unity PRIVATE ggml common common-ggml)
+target_link_libraries(unity PRIVATE ggml common common-ggml kaldi-native-fbank)
 target_sources(unity
     PRIVATE
         fairseq2.cpp

+ 336 - 3
ggml/examples/unity/fairseq2.cpp

@@ -1,8 +1,11 @@
 #include <math.h>
+#include "kaldi-native-fbank/csrc/feature-fbank.h"
+#include "kaldi-native-fbank/csrc/feature-window.h"
 #include "ggml.h"
 #include "fairseq2.h"
 #include <unordered_map>
 #include <algorithm>
+#include <iostream>
 
 
 /// allocate the fairseq2 model and hyperparameters
@@ -13,14 +16,14 @@ extern "C" fairseq2_model* fairseq2_model_alloc() {
     model->arch = new std::uint64_t[16 * 1024];  // max tensors allowed
     model->tensors_ctx = nullptr;
     return model;
-};
+}
 
 extern "C" void fairseq2_model_free(fairseq2_model* model) {
     if (model->tensors_ctx) ggml_free(model->tensors_ctx);
     delete (std::uint64_t*)(model->arch);
     delete (std::uint8_t*)model->hparams;
     delete model;
-};
+}
 
 extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx) {
     model->ctx = ctx;
@@ -92,6 +95,21 @@ extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
     return seqs;
 }
 
+extern "C" ggml_tensor* SiluFeedForwardNetwork_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+) {
+    seqs = Linear_forward(model, prefix + ".inner_proj", seqs);
+    seqs = ggml_silu(model.ctx, seqs);
+
+    if (has_layer(model, prefix + ".inner_layer_norm")) {
+        seqs = LayerNorm_forward(model, prefix + ".inner_layer_norm", seqs);
+    }
+
+    seqs = Linear_forward(model, prefix + ".output_proj", seqs);
+    return seqs;
+}
 
 ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim) {
     int n_dims = x->n_dims;
@@ -286,6 +304,321 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
     return seqs;
 }
 
+extern "C" ggml_tensor* WaveformToFbank_forward(
+    fairseq2_model& model,
+    const std::string &prefix,
+    ggml_tensor* waveform 
+) {
+    // Hardcoding: num_bins 80, sample rate 16k, always standardize
+    ggml_context* ctx = model.ctx;
+    knf::MelBanksOptions mel_opts{};
+    mel_opts.num_bins = 80;
+
+    knf::FrameExtractionOptions frame_opts{};
+    frame_opts.samp_freq = 16000;
+
+    knf::FbankOptions opts{};
+    opts.frame_opts = frame_opts;
+    opts.mel_opts = mel_opts;
+    
+
+    std::vector<float_t> signal_frame{};
+    std::int32_t num_frames = knf::NumFrames(/*num_samples=*/waveform->ne[0], frame_opts);
+    struct ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 80, num_frames);
+    knf::FbankComputer native_(opts);
+    knf::FeatureWindowFunction window_fn_(native_.GetFrameOptions());
+
+    for (std::int32_t frame_nr = 0; frame_nr < num_frames; ++frame_nr) {
+        signal_frame.resize(0);
+
+        // Extract the frame from the waveform tensor.
+        knf::ExtractWindow(
+            /*sample_offset=*/0,
+            (float *)(waveform->data),
+            waveform->ne[0],
+            frame_nr,
+            frame_opts,
+            window_fn_,
+            &signal_frame);
+
+        native_.Compute(
+            /*signal_raw_log_energy=*/0, /*vtln_warp=*/1.0, &signal_frame, ((float *)(output->data) + frame_nr * 80));
+    }
+    output = ggml_dup(ctx, ggml_transpose(ctx, output));
+    output = ggml_norm(ctx, output, 1e-5);
+    output = ggml_dup(ctx, ggml_transpose(ctx, output));
+    if (output->ne[1] % 2 == 1) {
+        struct ggml_tensor * remove_last = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, output->ne[1]-1);
+        for (int i = 0; i < output->ne[1]-1; ++i) {
+            ((int32_t *) remove_last->data)[i] = i;
+        }
+        output = ggml_get_rows(ctx, output, remove_last);
+    }
+    output = ggml_reshape_2d(ctx, output, output->ne[0] * 2, output->ne[1] / 2);
+    return output;
+}
+
+// TODO: Check if it's possible to merge with standard MHA
+extern "C" ggml_tensor* RelativePositionMHA_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+) {
+    ggml_context* ctx = model.ctx;
+
+    ggml_tensor* residual = seqs;
+    seqs = LayerNorm_forward(model, prefix + "_layer_norm", seqs);
+    // self_attn: qkv
+    struct ggml_tensor * Qcur = Linear_forward(model, prefix + ".q_proj", seqs);
+    struct ggml_tensor * Kcur = Linear_forward(model, prefix + ".k_proj", seqs);
+    struct ggml_tensor * Vcur = Linear_forward(model, prefix + ".v_proj", seqs);
+    
+    // self_attn: rel_pos SDPA
+    int32_t S = seqs->ne[1];
+    int32_t H = 16; // TODO: Make this configurable
+    int32_t n_ctx = 4096;
+    int32_t K_h = seqs->ne[0] / H;
+    
+    int32_t start_index = n_ctx - S;
+    int32_t end_index = n_ctx + S - 1;
+
+    int num_indices = end_index - start_index;
+
+    struct ggml_tensor *rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices);
+    rows->data = malloc(ggml_nbytes(rows));
+
+    for (int i = 0; i < num_indices; i++) {
+        ((int32_t *)rows->data)[i] = start_index + i;
+    }
+    
+    // self_attn: load pos_enc weights & compute_r
+    // In fairseq2 pos_enc weights are calculated on the fly, since some more custom operators might be needed to enable this, 
+    // we store the results (fixed) in checkpoint as model.audio_enc_pos_enc_w and load directly. 
+    struct ggml_tensor * r = ggml_get_rows(ctx, model.tensors["speech_encoder.pos_enc"], rows);
+    r = ggml_mul_mat(ctx, model.tensors[prefix + ".sdpa.r_proj.weight"], r);
+    r = ggml_dup(ctx, ggml_permute(ctx,
+                        ggml_cpy(ctx,
+                            r,
+                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S*2-1)),
+                        0, 2, 1, 3));
+    
+    struct ggml_tensor * u_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.u_bias"], K_h, 1, H);
+    struct ggml_tensor * v_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.v_bias"], K_h, 1, H);
+
+    // self_attn: Permute QKV
+    
+    struct ggml_tensor * Q =
+                ggml_dup(ctx, ggml_permute(ctx,
+                        ggml_cpy(ctx,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
+                        0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
+    struct ggml_tensor * K = 
+                ggml_dup(ctx, ggml_permute(ctx,
+                        ggml_cpy(ctx,
+                            Kcur,
+                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
+                        0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
+    struct ggml_tensor * V = 
+                ggml_dup(ctx, ggml_permute(ctx,
+                        ggml_cpy(ctx,
+                            Vcur,
+                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
+                        1, 2, 0, 3)); // (H * K_h, S) -> (K_h, H, S) -> (H, S, K_h)
+    
+    
+    struct ggml_tensor * q_with_u_bias = ggml_add(ctx, Q, u_bias); // (K_h, S, H)
+    struct ggml_tensor * q_with_v_bias = ggml_add(ctx, Q, v_bias); // (K_h, S, H)
+    
+    struct ggml_tensor * ac = ggml_mul_mat(ctx, K, q_with_u_bias);
+    struct ggml_tensor * bd = ggml_mul_mat(ctx, r, q_with_v_bias);
+    
+    
+    // self_attn: shift_bd. Logic follows https://github.com/facebookresearch/fairseq2/blob/main/src/fairseq2/nn/transformer/relative_attention.py#L161
+    bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // H, S, 2S-1
+    
+    struct ggml_tensor * pad = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, H, S, 1);
+    pad->data = malloc(ggml_nbytes(pad));
+
+    pad = ggml_set_f32(pad, 0.0);
+    bd = ggml_concat(ctx, pad, bd); // bd[i][j][0] == 0, (H, S, 2S)
+    bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // (2S, S, H) 
+    bd = ggml_dup(ctx, ggml_reshape_3d(ctx, bd, S, 2*S, H));  // (S, 2S, H)
+    bd = ggml_remove_head_row(ctx, bd); // A custom operator introduced to reduce 1st row (in the 2nd dim)
+
+    bd = ggml_reshape_3d(ctx, bd, 2*S-1, S, H);
+
+    bd = ggml_get_first_cols_by_rows(ctx, bd); // A custom operator introduced to get first #rows cols. 
+    
+
+    // self_attn: compute attn / weights
+    struct ggml_tensor * attn_weights = ggml_add(ctx, ac, bd);
+    struct ggml_tensor * attn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
+    attn_scale->data = malloc(ggml_nbytes(attn_scale));
+    ggml_set_f32(attn_scale, 1.0 / pow(K_h, 0.5));
+    attn_weights = ggml_mul(ctx, ggml_repeat(ctx, attn_scale, attn_weights), attn_weights);
+    attn_weights = ggml_soft_max(ctx, attn_weights);
+    
+    struct ggml_tensor * attn = ggml_mul_mat(ctx, V, attn_weights); // K_h, S, H
+    attn = ggml_dup(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));
+    struct ggml_tensor * attn_2d = ggml_reshape_2d(ctx, attn, K_h * H, S); 
+    
+    struct ggml_tensor * attn_out = ggml_mul_mat(ctx, model.tensors[prefix + ".output_proj.weight"], attn_2d);
+    attn_out = ggml_add(ctx,
+            ggml_repeat(ctx,
+                model.tensors[prefix + ".output_proj.bias"],
+                attn_out),
+            attn_out);
+    attn_out = ggml_add(ctx, residual, attn_out);
+    return attn_out;
+}
+
+extern "C" ggml_tensor* ConvModule_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+) {
+        ggml_context* ctx = model.ctx;
+        ggml_tensor* residual = seqs;
+        seqs = LayerNorm_forward(model, prefix + "_layer_norm", seqs);
+        // conv: Use matmul for pointwise conv 1 - kernel_size=1, no padding case
+        seqs = ggml_mul_mat(ctx, model.tensors[prefix + ".pointwise_conv1.weight"], seqs);
+        
+        // conv: GLU
+        seqs = ggml_glu(ctx, seqs);
+        seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
+       
+        // S x C -> (S+K-1) x C -> K x S x C -> S x C
+        seqs = ggml_conv_1d(ctx, model.tensors[prefix + ".depthwise_conv.weight"], seqs, 1, 15, 1);
+        
+        // conv: Custom implementation of batch norm
+        seqs = ggml_batch_norm(ctx, seqs, model.tensors[prefix + ".batch_norm.weight"], model.tensors[prefix + ".batch_norm.bias"], model.tensors[prefix + ".batch_norm.running_mean"], model.tensors[prefix + ".batch_norm.running_var"], 1e-5);
+        
+        // conv: SiLU actvation
+        seqs = ggml_silu(ctx, seqs);
+        seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
+
+        // conv: Use matmul for pointwise conv 2 - kernel_size=1, no padding case
+        seqs = ggml_mul_mat(ctx, model.tensors[prefix + ".pointwise_conv2.weight"], seqs); 
+
+        // conv: + residual
+        seqs = ggml_add(ctx, seqs, residual);
+        return seqs;
+}
+
+extern "C" ggml_tensor* StandardConformerEncoderLayer_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs,
+    ggml_tensor* padding_mask
+) {
+    ggml_context* ctx = model.ctx;
+    struct ggml_tensor * ffn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
+    ffn_scale->data = malloc(ggml_nbytes(ffn_scale));
+    ggml_set_f32(ffn_scale, 0.5f);
+    struct ggml_tensor * residual = seqs;
+    seqs = LayerNorm_forward(model, prefix + ".ffn1_layer_norm", seqs);
+    seqs = SiluFeedForwardNetwork_forward(model, prefix + ".ffn1", seqs);
+    seqs = ggml_mul(ctx, ggml_repeat(ctx, ffn_scale, seqs), seqs);
+    seqs = ggml_add(ctx, seqs, residual);
+    seqs = RelativePositionMHA_forward(model, prefix + ".self_attn", seqs);
+    seqs = ConvModule_forward(model, prefix + ".conv", seqs);
+    residual = seqs;
+    seqs = LayerNorm_forward(model, prefix + ".ffn2_layer_norm", seqs);
+    seqs = SiluFeedForwardNetwork_forward(model, prefix + ".ffn2", seqs);
+    seqs = ggml_mul(ctx, ggml_repeat(ctx, ffn_scale, seqs), seqs);
+    seqs = ggml_add(ctx, seqs, residual);
+    seqs = LayerNorm_forward(model, prefix + ".layer_norm", seqs);
+    return seqs;
+}
+
+extern "C" ggml_tensor* StandardConformerEncoder_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs,
+    ggml_tensor* padding_mask
+) { // TODO: Implement this!
+    ggml_context* ctx = model.ctx;
+    seqs = WaveformToFbank_forward(model, prefix, seqs);
+    seqs = LayerNorm_forward(model, prefix + "_frontend.post_extract_layer_norm", seqs);
+    seqs = Linear_forward(model, prefix + "_frontend.model_dim_proj", seqs);
+    int layer_idx = 0;
+    
+    std::string layer_name = prefix + ".inner.layers." + std::to_string(layer_idx);
+    
+    while (has_layer(model, layer_name)) {
+        seqs = StandardConformerEncoderLayer_forward(
+            model, layer_name, seqs, padding_mask
+        );
+        ggml_set_name(seqs, ("x_enc_" + std::to_string(layer_idx)).c_str());
+        layer_idx += 1;
+        layer_name = prefix + ".inner.layers." + std::to_string(layer_idx);
+    }
+
+    seqs = LayerNorm_forward(model, prefix + ".inner_layer_norm", seqs);
+    ggml_tensor* residual = seqs;
+    seqs = Linear_forward(model, prefix + ".proj1", seqs);
+    seqs = ggml_relu_inplace(ctx, seqs);
+    seqs = Linear_forward(model, prefix + ".proj2", seqs);
+    struct ggml_tensor * ffn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
+    ffn_scale->data = malloc(ggml_nbytes(ffn_scale));
+    ggml_set_f32(ffn_scale, 0.5f);
+    seqs = ggml_mul(ctx, ggml_repeat(ctx, ffn_scale, seqs), seqs);
+    seqs = ggml_add(ctx, seqs, residual);
+    layer_idx = 0;
+    layer_name = prefix + ".adaptor_layers." + std::to_string(layer_idx);
+    while (has_layer(model, layer_name)) {
+        seqs = StandardConformerEncoderAdaptorLayer_forward(
+            model, layer_name, seqs, padding_mask
+        );
+        ggml_set_name(seqs, ("x_ada_" + std::to_string(layer_idx)).c_str());
+        layer_idx += 1;
+        layer_name = prefix + ".adaptor_layers." + std::to_string(layer_idx);
+    }
+    seqs = LayerNorm_forward(model, prefix + ".layer_norm", seqs);
+    
+    return seqs;
+}
+
+extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs,
+    ggml_tensor* padding_mask
+) {
+    ggml_context* ctx = model.ctx;
+    struct ggml_tensor * residual = seqs;
+    residual = LayerNorm_forward(model, prefix + ".residual_layer_norm", residual);
+    residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
+    residual = ggml_conv_1d_generic(ctx, model.tensors[prefix + ".residual_conv.weight"], residual, 8, 4, 1); 
+    residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
+    residual = ggml_add(ctx, ggml_repeat(ctx, model.tensors[prefix + ".residual_conv.bias"], residual), residual);
+    residual = ggml_glu(ctx, residual);
+
+    seqs = LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs);
+    seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
+    seqs = ggml_conv_1d_generic(ctx, model.tensors[prefix + ".self_attn_conv.weight"], seqs, 8, 4, 1);
+    seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
+    seqs = ggml_add(ctx, ggml_repeat(ctx, model.tensors[prefix + ".self_attn_conv.bias"], seqs), seqs);
+    seqs = ggml_glu(ctx, seqs); 
+    
+    seqs = MultiheadAttention_forward(
+        model,
+        prefix + ".self_attn",
+        seqs,
+        seqs,
+        seqs,
+        /*attention masks=*/nullptr
+    );
+    seqs = ggml_add(ctx, seqs, residual);
+    residual = seqs;
+    seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs);
+    seqs = StandardFeedForwardNetwork_forward(model, prefix + ".ffn", seqs);
+    seqs = ggml_add(ctx, seqs, residual);
+    return seqs;
+}
+
+
 /// ggml_slice(X, -1, start, end) is equivalent to X[start:end]
 /// ggml_slice(X, 0, start, end) is equivalent to X[..., start:end]
 struct ggml_tensor * ggml_slice(
@@ -905,7 +1238,7 @@ extern "C" Hypothesis* generate_sequence(
             ggml_graph_compute_with_ctx(ctx, &gf, 1);
             ggml_detach(new_scores);
         }
-
+        
         // new_seqs[:, step_nr + 1] = next_tokens
         // new_scores[:, step_nr + 1] = next_scores
         for (std::size_t i = 0; i < beam_size; ++i) {

+ 51 - 0
ggml/examples/unity/fairseq2.h

@@ -4,6 +4,7 @@
 #include <string>
 #include <vector>
 #include "ggml.h"
+#include "kaldi-native-fbank/csrc/feature-fbank.h"
 
 
 struct fairseq2_model {
@@ -27,6 +28,11 @@ extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_con
 extern "C" std::string* std_string_alloc(char* c_str);
 extern "C" void std_string_free(std::string* str);
 
+extern "C" ggml_tensor* WaveformToFbank_forward(
+    fairseq2_model& model,
+    const std::string &prefix,
+    ggml_tensor* waveform 
+);
 extern "C" ggml_tensor* ggml_slice(
     struct ggml_context* ctx,
     struct ggml_tensor* a,
@@ -64,6 +70,12 @@ extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
     ggml_tensor* seqs
 );
 
+extern "C" ggml_tensor* SiluFeedForwardNetwork_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+);
+
 extern "C" ggml_tensor* MultiheadAttention_forward(
     fairseq2_model& model,
     const std::string &prefix,
@@ -93,6 +105,45 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
     ggml_tensor* padding_mask
 );
 
+extern "C" ggml_tensor* RelativePositionMHA_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+);
+
+extern "C" ggml_tensor* ConvModule_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+);
+
+extern "C" ggml_tensor* StandardConformerEncoderLayer_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs,
+    ggml_tensor* padding_mask
+);
+
+extern "C" ggml_tensor* StandardConformerEncoder_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs,
+    ggml_tensor* padding_mask
+);
+
+extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs,
+    ggml_tensor* padding_mask
+);
+
+extern "C" ggml_tensor* StandardConformerEncoderAdaptor_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs,
+    ggml_tensor* padding_mask
+);
 // Specifies the Layer Normalization order.
 enum TransformerNormOrder {
     TRANSFORMER_NORM_ORDER_POST = 0,

+ 50 - 1
ggml/ggml_convert.py

@@ -11,6 +11,7 @@ from enum import Enum
 from io import BufferedWriter
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Tuple, Union
+import math
 import torch
 import ggml
 from typing import List
@@ -21,6 +22,49 @@ from seamless_communication.models.unity import load_unity_config, load_unity_mo
 
 Preprocessor = Callable[[Any], Any]
 
+def pos_enc(max_seq_len=4096, encoding_dim=1024):
+    weight = torch.empty(
+        ((max_seq_len * 2) - 1, encoding_dim), dtype=torch.float32
+    )
+    # copied from https://github.com/facebookresearch/fairseq2/blob/main/src/fairseq2/nn/transformer/relative_attention.py#L22
+    dtype = torch.float32
+    weight = weight.to(dtype)
+
+    positive_w = weight[: max_seq_len]
+    negative_w = weight[max_seq_len :]
+
+    device = weight.device
+
+    # (E / 2)
+    indices = torch.arange(0, encoding_dim, step=2, device=device, dtype=dtype)
+
+    # (1, E / 2)
+    indices = indices.unsqueeze(0)
+
+    # (S)
+    steps = torch.arange(max_seq_len, device=device, dtype=dtype)
+
+    # (S, 1)
+    steps = steps.unsqueeze(1)
+
+    factors = torch.exp(indices * -math.log(10000) / encoding_dim)
+
+    # (S, 1) x (1, E / 2) -> (S, E / 2)
+    factors = torch.matmul(steps, factors)
+
+    flipped_factors = factors.flip([0])
+
+    # A mirrored matrix of sinusoidal positive and negative positional
+    # encodings to use in shift trick.
+    #
+    # [max, ...,  3,  2,  1,  0, -1, -2, -3, ..., min]
+    torch.sin(flipped_factors, out=positive_w[:, 0::2])
+    torch.cos(flipped_factors, out=positive_w[:, 1::2])
+
+    torch.sin(-1 * factors[1:], out=negative_w[:, 0::2])
+    torch.cos(-1 * factors[1:], out=negative_w[:, 1::2])
+
+    return weight
 
 def convert_model(model_name: str, out: Optional[Path] = None) -> None:
     if out is None:
@@ -87,6 +131,7 @@ def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) ->
         assert name not in state_dict
         state_dict[name] = pos_encoder.weight
 
+    state_dict["speech_encoder.pos_enc"] = pos_enc()
 
 def write_ggml_file(
     out: BufferedWriter, hparams: Dict[str, Any], state_dict: Dict[str, torch.Tensor]
@@ -142,9 +187,13 @@ def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -
     """
     for key, value in state_dict.items():
         write_string(out, key)
-        if key.endswith(".bias") and value.ndim == 1:
+        if key.endswith(".bias") and value.ndim == 1 and "adaptor" not in key:
             # GGML broadcasting isn't as strong as numpy
             value = value.reshape(1, -1)
+        if "pointwise_conv" in key: # pointwise_conv / depthwise_conv
+            value = value.squeeze(-1)
+        if "depthwise_conv" in key:
+            value = value.squeeze(1)
         write_tensor(out, value.contiguous())
 
 

+ 36 - 4
ggml/include/ggml/ggml.h

@@ -347,7 +347,7 @@ extern "C" {
         GGML_OP_NONE = 0,
 
         GGML_OP_DUP,
-        GGML_OP_ADD, //2
+        GGML_OP_ADD,
         GGML_OP_ADD1,
         GGML_OP_ACC,
         GGML_OP_SUB,
@@ -367,19 +367,21 @@ extern "C" {
         GGML_OP_GET_FIRST_COLS_BY_ROWS,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
+        GGML_OP_BATCH_NORM, 
         GGML_OP_RMS_NORM,
         GGML_OP_RMS_NORM_BACK,
         GGML_OP_GROUP_NORM,
 
-        GGML_OP_MUL_MAT, //23
+        GGML_OP_MUL_MAT,
         GGML_OP_OUT_PROD,
+
         GGML_OP_SCALE,
         GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
-        GGML_OP_RESHAPE, //29
+        GGML_OP_RESHAPE,
         GGML_OP_VIEW,
-        GGML_OP_PERMUTE, //32
+        GGML_OP_PERMUTE,
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
         GGML_OP_GET_ROWS_BACK,
@@ -393,11 +395,19 @@ extern "C" {
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
+        GGML_OP_CONV_1D_GENERIC,
         GGML_OP_CONV_2D,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
+        GGML_OP_CONV_1D_STAGE_0,  // internal
+        GGML_OP_CONV_1D_STAGE_1,  // internal
+        GGML_OP_CONV_1D_STAGE_2,  // internal
+
+        GGML_OP_CONV_1D_GENERIC_STAGE_0,
+        GGML_OP_CONV_1D_GENERIC_STAGE_1,  
+
         GGML_OP_UPSCALE, // nearest interpolate
 
         GGML_OP_FLASH_ATTN,
@@ -438,6 +448,7 @@ extern "C" {
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
+        GGML_UNARY_OP_GLU,
     };
 
     enum ggml_object_type {
@@ -937,6 +948,10 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    GGML_API struct ggml_tensor * ggml_glu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
@@ -948,6 +963,15 @@ extern "C" {
             struct ggml_tensor  * a,
             float                 eps);
 
+    GGML_API struct ggml_tensor * ggml_batch_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * gamma,
+            struct ggml_tensor  * beta,
+            struct ggml_tensor  * running_mean,
+            struct ggml_tensor  * running_var,
+            float eps);
+
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1319,6 +1343,14 @@ extern "C" {
             int                   p0,  // padding
             int                   d0); // dilation
 
+    GGML_API struct ggml_tensor * ggml_conv_1d_generic(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
+
     // conv_1d with padding = half
     // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
     GGML_API struct ggml_tensor* ggml_conv_1d_ph(

+ 2 - 2
ggml/src/CMakeLists.txt

@@ -271,9 +271,9 @@ target_include_directories(${TARGET} PUBLIC
     )
 
 if (MSVC)
-    target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
+    target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank)
 else()
-    target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
+    target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank)
 endif()
 
 if (BUILD_SHARED_LIBS)

La diferencia del archivo ha sido suprimido porque es demasiado grande
+ 1069 - 71
ggml/src/ggml.c


+ 216 - 2
ggml/test_unity_cpp.py

@@ -18,7 +18,9 @@ from typing import Iterator
 from ggml import NativeObj
 from ggml_convert import convert_model
 from seamless_communication.models.inference.translator import Translator, Modality
-
+from fairseq2.data.audio import WaveformToFbankConverter
+import torchaudio
+from fairseq2.models.wav2vec2.feature_extractor import Wav2Vec2FbankFeatureExtractor
 Ctx = ggml.ggml_context_p
 
 UNITY_MODELS = Path(__file__).parent / "examples/unity/models"
@@ -253,6 +255,60 @@ def test_StandardTransformerEncoderLayer_forward(
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
 
+def test_StandardConformerEncoderLayer_forward(
+    ctx: Ctx, g_model: c_void_p
+) -> None:
+    pt_model = load_pt_model()
+    x = torch.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_conformer_block.pt")
+    padding_mask = torch.ones((1, x.shape[1]))
+    layer = pt_model.speech_encoder.inner.layers[0]
+    gx = ggml.from_numpy(ctx, x[0])
+    ggml.ggml_set_name(gx, b"x")
+    gpad = ggml.from_numpy(ctx, padding_mask[0])
+    ggml.ggml_set_name(gpad, b"padding_mask")
+    gy = ggml.forward(
+        "StandardConformerEncoderLayer",
+        g_model,
+        "speech_encoder.inner.layers.0",
+        gx,
+        None,  # TODO support padding mask
+    )
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+
+    y = ggml.to_numpy(gy)
+
+    y_exp, _ = layer(x, padding_mask)
+    y_exp = y_exp.numpy()  
+    assert y.shape == y_exp.shape
+    assert np.allclose(y_exp, y, atol=2e-3)
+
+def test_StandardConformerEncoderAdaptorLayer_forward(
+    ctx: Ctx, g_model: c_void_p
+) -> None:
+    pt_model = load_pt_model()
+    x = torch.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_adaptor.pt")
+    layer = pt_model.speech_encoder.adaptor_layers[0]
+    gx = ggml.from_numpy(ctx, x[0])
+    ggml.ggml_set_name(gx, b"x")
+    gy = ggml.forward(
+        "StandardConformerEncoderAdaptorLayer",
+        g_model,
+        "speech_encoder.adaptor_layers.0",
+        gx,
+        None,  # TODO support padding mask
+    )
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+
+    y = ggml.to_numpy(gy)
+
+    y_exp, _ = layer(x, None)
+    y_exp = y_exp.numpy() 
+
+    assert y.shape == y_exp.shape
+    assert np.allclose(y_exp, y, atol=2e-3)
+
 
 def test_StandardTransformerEncoder_forward(
     ctx: Ctx, g_model: c_void_p
@@ -283,7 +339,97 @@ def test_StandardTransformerEncoder_forward(
     y_exp = y_exp.numpy()
 
     assert y.shape == y_exp.shape
-    assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
+    assert np.allclose(y_exp, y, atol=1e-4)
+
+def test_StandardConformerEncoder_forward(
+    ctx: Ctx, g_model: c_void_p
+) -> None:
+    pt_model = load_pt_model()
+    wav, _ = torchaudio.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav")
+    gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml!
+    ggml.ggml_set_name(gx, b"x")
+    gy = ggml.forward(
+        "StandardConformerEncoder",
+        g_model,
+        "speech_encoder",
+        gx,
+        None,  # TODO support padding mask
+    )
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+
+    converter = WaveformToFbankConverter(
+        num_mel_bins=80,
+        waveform_scale=2**15,
+        channel_last=True,
+        standardize=True,
+    )
+    converter_input = {
+        "waveform": wav.transpose(0, 1),
+        "sample_rate": 16000.,
+        "format": -1,
+    }
+
+    y = ggml.to_numpy(gy)
+    speech_encoder_input = pt_model.speech_encoder_frontend(converter(converter_input)["fbank"].unsqueeze(0), None)[0]
+
+    y_exp, _ = pt_model.speech_encoder(speech_encoder_input, None)
+    y_exp = y_exp.numpy()  # remove batch dimension
+
+    assert y.shape == y_exp.shape
+    assert np.allclose(y_exp, y, atol=1e-2) # There are 10 elements in a 137*1024 tensor with error >1e-2
+
+    
+
+def test_WaveformToFbank_forward(
+    ctx: Ctx, g_model: c_void_p
+) -> None:
+    pt_model = load_pt_model()
+    converter = WaveformToFbankConverter(
+        num_mel_bins=80,
+        waveform_scale=2**15,
+        channel_last=True,
+        standardize=True,
+    )
+    extractor = Wav2Vec2FbankFeatureExtractor(80, 2, 1)
+    wav, _ = torchaudio.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav")
+    gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml!
+    ggml.ggml_set_name(gx, b"x")
+    
+    gy = ggml.forward(
+        "WaveformToFbank",
+        g_model,
+        "",
+        gx
+    )
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+
+    y = ggml.to_numpy(gy)
+    converter_input = {
+        "waveform": wav.transpose(0, 1),
+        "sample_rate": 16000.,
+        "format": -1,
+    }
+    y_exp = extractor(converter(converter_input)["fbank"].unsqueeze(0), None)[0]
+    y_exp = y_exp.numpy() 
+
+    assert y.shape == y_exp.shape
+    assert np.allclose(y_exp, y, atol=4e-3) # reduce? error is from standardization
+
+
+def test_causal_attention_mask(ctx: Ctx):
+    x = torch.zeros((5, 10))
+    generator = fairseq2.nn.transformer.CausalAttentionMaskGenerator()
+    mask_exp = generator(x)
+
+    gx = ggml.from_numpy(ctx, x)
+    gmask = ggml.causal_attention_mask(ctx, gx)
+    mask = ggml.to_numpy(gmask)
+
+    assert mask_exp.shape == (10, 10)
+    assert mask.shape == (10, 10)
+    assert np.allclose(mask, mask_exp)
 
 
 def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
@@ -443,3 +589,71 @@ def test_t2tt(ctx: Ctx, g_model: c_void_p):
         # The score error is big, this may negatively impact the beam search.
         assert np.allclose(g_step_scores, exp["step_scores"], atol=0.1)
 
+def test_s2tt(ctx: Ctx, g_model: c_void_p):
+    src_audio_wav, _ = torchaudio.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav")
+    # translator = load_translator()
+    # token_encoder = translator.text_tokenizer.create_encoder(
+    #     task="translation"
+    # )
+    # decoded_audio = {
+    #     "waveform": src_audio_wav.t(),
+    #     "sample_rate": 16000.,
+    #     "format": -1,
+    # }
+    # src = translator.collate(translator.convert_to_fbank(decoded_audio))["fbank"]
+
+    # text_out, _ = translator.get_prediction(
+    #     translator.model,
+    #     translator.text_tokenizer,
+    #     translator.unit_tokenizer,
+    #     src,
+    #     input_modality=Modality.SPEECH,
+    #     output_modality=Modality.TEXT,
+    #     tgt_lang="cmn",
+    # )
+
+    # tgt_text = str(text_out.sentences[0])
+    # assert tgt_text == "大家好 , 世界无主题。"
+    # tgt_tokens = text_out.generator_output.results[0][0].seq
+    # score = text_out.generator_output.results[0][0].score.item()
+
+    tgt_tokens = [     3, 256200,  16991, 249346, 249725,    146,  25220, 251069, 249211,
+        251148, 253935,      3]
+    score = -1.606838583946228
+    gx = ggml.from_numpy(ctx, src_audio_wav * 2**15) # Apply scale before sending into ggml!
+    ggml.ggml_set_name(gx, b"x")
+    gy = ggml.forward(
+        "StandardConformerEncoder",
+        g_model,
+        "speech_encoder",
+        gx,
+        None,  # TODO support padding mask
+    )
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+
+    encoder_out = gy
+
+    job = ggml.SequenceGeneratorJob()
+    job.opts.beam_size = 1
+    job.opts.min_seq_len = 1
+    job.opts.soft_max_seq_len_a = 1
+    job.opts.soft_max_seq_len_b = 200
+    job.opts.hard_max_seq_len = 20
+    job.opts.len_penalty = 1.0
+    job.opts.unk_penalty = 0.0
+    job.prefix_seq = ggml.from_numpy(ctx, np.array([3, 256200]).astype(np.int32))
+    job.opts.normalize_scores = True
+    job.pad_idx = 0
+    job.unk_idx = 1
+    job.bos_idx = 2
+    job.eos_idx = 3
+
+    result = ggml.ggml_tensor()
+    
+    g_score = ggml.generate_sequence(
+        g_model, job, encoder_out, None, ctypes.byref(result)
+    )
+    tokens = list(ggml.to_numpy(result))
+    assert tokens == tgt_tokens
+    assert g_score == pytest.approx(score)

+ 0 - 1
ggml/third_party_ggml.py

@@ -60,7 +60,6 @@ from typing import Type
 from typing import Callable
 from typing import Tuple
 from typing import Dict
-from typing_extensions import Self
 from typing import Any
 from pathlib import Path
 from typing import List, Optional, Sequence, Union

Algunos archivos no se mostraron porque demasiados archivos cambiaron en este cambio