2 years ago · 1589c529bf
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -4,6 +4,7 @@ project(ggml VERSION 0.1.0)
 
				 set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
			
 
				 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
			
 
				 set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
			
 
				+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
			
 
				 
			
 
				 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
			
 
				     set(GGML_STANDALONE ON)
			
@@ -145,12 +146,20 @@ endif()
 
				 # dependencies
			
 
				 
			
 
				 set(CMAKE_C_STANDARD   11)
			
 
				-set(CMAKE_CXX_STANDARD 11)
			
 
				+set(CMAKE_CXX_STANDARD 14)
			
 
				 
			
 
				 find_package(Threads REQUIRED)
			
 
				 
			
 
				 # main
			
 
				 
			
 
				+# Static library built from the vendored kaldi-native-fbank sources.
			
 
				+# Only C/C++ translation units are globbed: a bare "csrc/*" also matches
			
 
				+# CMakeLists.txt, the headers, and any test-*.cc programs that
			
 
				+# csrc/CMakeLists.txt builds as separate executables against gtest; those
			
 
				+# must not be compiled into the library.
			
 
				+file(GLOB KALDI_NATIVE_FBANK_SOURCES
			
 
				+     "${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc/*.c"
			
 
				+     "${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc/*.cc"
			
 
				+)
			
 
				+# Drop test programs (file name starts with "test-") from the library sources.
			
 
				+foreach(knf_source IN LISTS KALDI_NATIVE_FBANK_SOURCES)
			
 
				+  get_filename_component(knf_source_name "${knf_source}" NAME)
			
 
				+  if(knf_source_name MATCHES "^test-")
			
 
				+    list(REMOVE_ITEM KALDI_NATIVE_FBANK_SOURCES "${knf_source}")
			
 
				+  endif()
			
 
				+endforeach()
			
 
				+add_library(kaldi-native-fbank STATIC ${KALDI_NATIVE_FBANK_SOURCES})
			
 
				+# PUBLIC so that targets linking this library can include its headers.
			
 
				+target_include_directories(kaldi-native-fbank PUBLIC
			
 
				+  ${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc
			
 
				+)
			
 
				+
			
 
				 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
			
 
				     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
			
 
				     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
			
--- a/ggml/examples/kaldi-native-fbank/CMakeLists.txt
+++ b/ggml/examples/kaldi-native-fbank/CMakeLists.txt
@@ -0,0 +1,8 @@
 
				+add_subdirectory(csrc)
			
 
				+
			
 
				+if(KALDI_NATIVE_FBANK_BUILD_PYTHON)
			
 
				+  message(STATUS "Building Python")
			
 
				+  add_subdirectory(python)
			
 
				+else()
			
 
				+  message(STATUS "Disable building Python")
			
 
				+endif()
			
--- a/ggml/examples/kaldi-native-fbank/csrc/CMakeLists.txt
+++ b/ggml/examples/kaldi-native-fbank/csrc/CMakeLists.txt
@@ -0,0 +1,93 @@
 
				+
			
 
				+include_directories(${PROJECT_SOURCE_DIR})
			
 
				+set(sources
			
 
				+  feature-fbank.cc
			
 
				+  feature-functions.cc
			
 
				+  feature-window.cc
			
 
				+  fftsg.c
			
 
				+  mel-computations.cc
			
 
				+  online-feature.cc
			
 
				+  rfft.cc
			
 
				+)
			
 
				+
			
 
				+if(KALDI_NATIVE_FBANK_ENABLE_CHECK)
			
 
				+  list(APPEND sources log.cc)
			
 
				+endif()
			
 
				+
			
 
				+add_library(kaldi-native-fbank-core ${sources})
			
 
				+if(KALDI_NATIVE_FBANK_ENABLE_CHECK)
			
 
				+  target_compile_definitions(kaldi-native-fbank-core PUBLIC KNF_ENABLE_CHECK=1)
			
 
				+
			
 
				+  if(KNF_HAVE_EXECINFO_H)
			
 
				+    target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_EXECINFO_H=1)
			
 
				+  endif()
			
 
				+
			
 
				+  if(KNF_HAVE_CXXABI_H)
			
 
				+    target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_CXXABI_H=1)
			
 
				+  endif()
			
 
				+endif()
			
 
				+
			
 
				+# We are using std::call_once() in log.h, which requires us to link with the
			
 
				+# platform thread library. Use the imported Threads::Threads target instead of
			
 
				+# a raw "-pthread" flag, and make it PUBLIC because the dependency comes from
			
 
				+# a header that consumers of this library include as well.
			
 
				+if(NOT WIN32 AND KALDI_NATIVE_FBANK_ENABLE_CHECK)
			
 
				+  # Ask FindThreads to prefer -pthread where the toolchain supports it.
			
 
				+  set(THREADS_PREFER_PTHREAD_FLAG ON)
			
 
				+  find_package(Threads REQUIRED)
			
 
				+  target_link_libraries(kaldi-native-fbank-core PUBLIC Threads::Threads)
			
 
				+endif()
			
 
				+
			
 
				+if(KALDI_NATIVE_FBANK_BUILD_TESTS)
			
 
				+  add_executable(test-online-fbank test-online-fbank.cc)
			
 
				+  target_link_libraries(test-online-fbank kaldi-native-fbank-core)
			
 
				+endif()
			
 
				+
			
 
				+function(kaldi_native_fbank_add_test source)
			
 
				+  get_filename_component(name ${source} NAME_WE)
			
 
				+  add_executable(${name} "${source}")
			
 
				+  target_link_libraries(${name}
			
 
				+    PRIVATE
			
 
				+      kaldi-native-fbank-core
			
 
				+      gtest
			
 
				+      gtest_main
			
 
				+  )
			
 
				+
			
 
				+  add_test(NAME "Test.${name}"
			
 
				+    COMMAND
			
 
				+    $<TARGET_FILE:${name}>
			
 
				+  )
			
 
				+endfunction()
			
 
				+
			
 
				+# please sort the source files alphabetically
			
 
				+set(test_srcs
			
 
				+  # test-online-feature.cc
			
 
				+  test-log.cc
			
 
				+  test-rfft.cc
			
 
				+)
			
 
				+
			
 
				+if(KALDI_NATIVE_FBANK_BUILD_TESTS)
			
 
				+  foreach(source IN LISTS test_srcs)
			
 
				+    kaldi_native_fbank_add_test(${source})
			
 
				+  endforeach()
			
 
				+endif()
			
 
				+
			
 
				+install(TARGETS kaldi-native-fbank-core
			
 
				+  DESTINATION lib
			
 
				+)
			
 
				+
			
 
				+if(KALDI_NATIVE_FBANK_BUILD_TESTS)
			
 
				+  install(TARGETS test-online-fbank
			
 
				+    DESTINATION bin
			
 
				+  )
			
 
				+endif()
			
 
				+
			
 
				+# Create the build-tree include directory that the headers are copied into.
			
 
				+# file(MAKE_DIRECTORY) takes only directory paths; it has no DESTINATION
			
 
				+# keyword, so that word was being treated as a relative directory name.
			
 
				+file(MAKE_DIRECTORY
			
 
				+  "${PROJECT_BINARY_DIR}/include/kaldi-native-fbank/csrc"
			
 
				+)
			
 
				+file(GLOB_RECURSE all_headers *.h)
			
 
				+
			
 
				+file(COPY
			
 
				+  ${all_headers}
			
 
				+  DESTINATION
			
 
				+    ${PROJECT_BINARY_DIR}/include/kaldi-native-fbank/csrc
			
 
				+)
			
 
				+
			
 
				+install(FILES ${all_headers}
			
 
				+  DESTINATION include/kaldi-native-fbank/csrc
			
 
				+)
			
--- a/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.cc
@@ -0,0 +1,120 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+// This file is copied/modified from kaldi/src/feat/feature-fbank.cc
			
 
				+//
			
 
				+#include "feature-fbank.h"
			
 
				+
			
 
				+#include <algorithm>
			
 
				+#include <cmath>
			
 
				+#include <limits>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "feature-functions.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+static void Sqrt(float *in_out, int32_t n) {
			
 
				+  for (int32_t i = 0; i != n; ++i) {
			
 
				+    in_out[i] = std::sqrt(in_out[i]);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) {
			
 
				+  os << opts.ToString();
			
 
				+  return os;
			
 
				+}
			
 
				+
			
 
				+// Builds a filterbank computer for `opts`: stores a copy of the options,
			
 
				+// constructs the real FFT with the padded window size, caches the log of the
			
 
				+// energy floor (when one is set) and pre-builds the mel banks for warp 1.0.
			
 
				+FbankComputer::FbankComputer(const FbankOptions &opts)
			
 
				+    : opts_(opts),
			
 
				+      // Always give the member a defined value, so the object never carries
			
 
				+      // an indeterminate float when no positive energy floor is configured.
			
 
				+      log_energy_floor_(0.0f),
			
 
				+      rfft_(opts.frame_opts.PaddedWindowSize()) {
			
 
				+  if (opts.energy_floor > 0.0f) {
			
 
				+    log_energy_floor_ = std::log(opts.energy_floor);
			
 
				+  }
			
 
				+
			
 
				+  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
			
 
				+  // [note: this call caches it.]
			
 
				+  GetMelBanks(1.0f);
			
 
				+}
			
 
				+
			
 
				+FbankComputer::~FbankComputer() {
			
 
				+  for (auto iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter)
			
 
				+    delete iter->second;
			
 
				+}
			
 
				+
			
 
				+const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) {
			
 
				+  MelBanks *this_mel_banks = nullptr;
			
 
				+
			
 
				+  // std::map<float, MelBanks *>::iterator iter = mel_banks_.find(vtln_warp);
			
 
				+  auto iter = mel_banks_.find(vtln_warp);
			
 
				+  if (iter == mel_banks_.end()) {
			
 
				+    this_mel_banks = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp);
			
 
				+    mel_banks_[vtln_warp] = this_mel_banks;
			
 
				+  } else {
			
 
				+    this_mel_banks = iter->second;
			
 
				+  }
			
 
				+  return this_mel_banks;
			
 
				+}
			
 
				+
			
 
				+void FbankComputer::Compute(float signal_raw_log_energy, float vtln_warp,
			
 
				+                            std::vector<float> *signal_frame, float *feature) {
			
 
				+  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
			
 
				+
			
 
				+  KNF_CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize());
			
 
				+
			
 
				+  // Compute energy after window function (not the raw one).
			
 
				+  if (opts_.use_energy && !opts_.raw_energy) {
			
 
				+    signal_raw_log_energy = std::log(
			
 
				+        std::max<float>(InnerProduct(signal_frame->data(), signal_frame->data(),
			
 
				+                                     signal_frame->size()),
			
 
				+                        std::numeric_limits<float>::epsilon()));
			
 
				+  }
			
 
				+  rfft_.Compute(signal_frame->data());  // signal_frame is modified in-place
			
 
				+  ComputePowerSpectrum(signal_frame);
			
 
				+
			
 
				+  // Use magnitude instead of power if requested.
			
 
				+  if (!opts_.use_power) {
			
 
				+    Sqrt(signal_frame->data(), signal_frame->size() / 2 + 1);
			
 
				+  }
			
 
				+
			
 
				+  int32_t mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
			
 
				+
			
 
				+  // Its length is opts_.mel_opts.num_bins
			
 
				+  float *mel_energies = feature + mel_offset;
			
 
				+
			
 
				+  // Sum with mel filter banks over the power spectrum
			
 
				+  mel_banks.Compute(signal_frame->data(), mel_energies);
			
 
				+
			
 
				+  if (opts_.use_log_fbank) {
			
 
				+    // Avoid log of zero (which should be prevented anyway by dithering).
			
 
				+    for (int32_t i = 0; i != opts_.mel_opts.num_bins; ++i) {
			
 
				+      auto t = std::max(mel_energies[i], std::numeric_limits<float>::epsilon());
			
 
				+      mel_energies[i] = std::log(t);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Copy energy as first value (or the last, if htk_compat == true).
			
 
				+  if (opts_.use_energy) {
			
 
				+    if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
			
 
				+      signal_raw_log_energy = log_energy_floor_;
			
 
				+    }
			
 
				+    int32_t energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
			
 
				+    feature[energy_index] = signal_raw_log_energy;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.h
+++ b/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.h
@@ -0,0 +1,134 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+// This file is copied/modified from kaldi/src/feat/feature-fbank.h
			
 
				+
			
 
				+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
			
 
				+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
			
 
				+
			
 
				+#include <map>
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "feature-window.h"
			
 
				+#include "mel-computations.h"
			
 
				+#include "rfft.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+struct FbankOptions {
			
 
				+  FrameExtractionOptions frame_opts;
			
 
				+  MelBanksOptions mel_opts;
			
 
				+  // append an extra dimension with energy to the filter banks
			
 
				+  bool use_energy = false;
			
 
				+  float energy_floor = 0.0f;  // active iff use_energy==true
			
 
				+
			
 
				+  // If true, compute log_energy before preemphasis and windowing
			
 
				+  // If false, compute log_energy after preemphasis and windowing
			
 
				+  bool raw_energy = true;  // active iff use_energy==true
			
 
				+
			
 
				+  // If true, put energy last (if using energy)
			
 
				+  // If false, put energy first
			
 
				+  bool htk_compat = false;  // active iff use_energy==true
			
 
				+
			
 
				+  // if true (default), produce log-filterbank, else linear
			
 
				+  bool use_log_fbank = true;
			
 
				+
			
 
				+  // if true (default), use power in filterbank
			
 
				+  // analysis, else magnitude.
			
 
				+  bool use_power = true;
			
 
				+
			
 
				+  FbankOptions() { mel_opts.num_bins = 23; }
			
 
				+
			
 
				+  std::string ToString() const {
			
 
				+    std::ostringstream os;
			
 
				+    os << "frame_opts: \n";
			
 
				+    os << frame_opts << "\n";
			
 
				+    os << "\n";
			
 
				+
			
 
				+    os << "mel_opts: \n";
			
 
				+    os << mel_opts << "\n";
			
 
				+
			
 
				+    os << "use_energy: " << use_energy << "\n";
			
 
				+    os << "energy_floor: " << energy_floor << "\n";
			
 
				+    os << "raw_energy: " << raw_energy << "\n";
			
 
				+    os << "htk_compat: " << htk_compat << "\n";
			
 
				+    os << "use_log_fbank: " << use_log_fbank << "\n";
			
 
				+    os << "use_power: " << use_power << "\n";
			
 
				+    return os.str();
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+std::ostream &operator<<(std::ostream &os, const FbankOptions &opts);
			
 
				+
			
 
				+class FbankComputer {
			
 
				+ public:
			
 
				+  using Options = FbankOptions;
			
 
				+
			
 
				+  explicit FbankComputer(const FbankOptions &opts);
			
 
				+  ~FbankComputer();
			
 
				+
			
 
				+  int32_t Dim() const {
			
 
				+    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
			
 
				+  }
			
 
				+
			
 
				+  // if true, compute log_energy_pre_window but after dithering and dc removal
			
 
				+  bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
			
 
				+
			
 
				+  const FrameExtractionOptions &GetFrameOptions() const {
			
 
				+    return opts_.frame_opts;
			
 
				+  }
			
 
				+
			
 
				+  const FbankOptions &GetOptions() const { return opts_; }
			
 
				+
			
 
				+  /**
			
 
				+     Function that computes one frame of features from
			
 
				+     one frame of signal.
			
 
				+
			
 
				+     @param [in] signal_raw_log_energy The log-energy of the frame of the signal
			
 
				+         prior to windowing and pre-emphasis, or
			
 
				+         log(numeric_limits<float>::min()), whichever is greater.  Must be
			
 
				+         ignored by this function if this class returns false from
			
 
				+         this->NeedRawLogEnergy().
			
 
				+     @param [in] vtln_warp  The VTLN warping factor that the user wants
			
 
				+         to be applied when computing features for this utterance.  Will
			
 
				+         normally be 1.0, meaning no warping is to be done.  The value will
			
 
				+         be ignored for feature types that don't support VTLN, such as
			
 
				+         spectrogram features.
			
 
				+     @param [in] signal_frame  One frame of the signal,
			
 
				+       as extracted using the function ExtractWindow() using the options
			
 
				+       returned by this->GetFrameOptions().  The function will use the
			
 
				+       vector as a workspace, which is why it's a non-const pointer.
			
 
				+     @param [out] feature  Pointer to a vector of size this->Dim(), to which
			
 
				+         the computed feature will be written. It should be pre-allocated.
			
 
				+  */
			
 
				+  void Compute(float signal_raw_log_energy, float vtln_warp,
			
 
				+               std::vector<float> *signal_frame, float *feature);
			
 
				+
			
 
				+ private:
			
 
				+  const MelBanks *GetMelBanks(float vtln_warp);
			
 
				+
			
 
				+  FbankOptions opts_;
			
 
				+  float log_energy_floor_;
			
 
				+  std::map<float, MelBanks *> mel_banks_;  // float is VTLN coefficient.
			
 
				+  Rfft rfft_;
			
 
				+};
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
			
--- a/ggml/examples/kaldi-native-fbank/csrc/feature-functions.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/feature-functions.cc
@@ -0,0 +1,49 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+// This file is copied/modified from kaldi/src/feat/feature-functions.cc
			
 
				+
			
 
				+#include "feature-functions.h"
			
 
				+
			
 
				+#include <cstdint>
			
 
				+#include <vector>
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Converts, in place, the packed half-spectrum of a real FFT into a power
			
 
				+// spectrum.
			
 
				+//
			
 
				+// On input `*complex_fft` is laid out as
			
 
				+// [real0, realN/2, real1, im1, real2, im2, ...]. On output the first
			
 
				+// size()/2 + 1 entries hold the energy of each bin; the remaining entries
			
 
				+// are left with unspecified contents. An empty vector is left untouched and a
			
 
				+// one-element vector has its single value squared.
			
 
				+void ComputePowerSpectrum(std::vector<float> *complex_fft) {
			
 
				+  int32_t dim = complex_fft->size();
			
 
				+  float *p = complex_fft->data();
			
 
				+
			
 
				+  // Guard the degenerate sizes: the code below reads p[0] and p[1]
			
 
				+  // unconditionally, which would be out of bounds for dim < 2.
			
 
				+  if (dim < 2) {
			
 
				+    if (dim == 1) {
			
 
				+      p[0] = p[0] * p[0];
			
 
				+    }
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  // now we have in complex_fft, first half of complex spectrum
			
 
				+  // it's stored as [real0, realN/2, real1, im1, real2, im2, ...]
			
 
				+
			
 
				+  int32_t half_dim = dim / 2;
			
 
				+  // Save the two purely real bins first: slot 1 is overwritten by the loop.
			
 
				+  float first_energy = p[0] * p[0];
			
 
				+  float last_energy = p[1] * p[1];  // handle this special case
			
 
				+
			
 
				+  // Writing p[i] is safe because it only clobbers pairs already consumed.
			
 
				+  for (int32_t i = 1; i < half_dim; ++i) {
			
 
				+    float real = p[i * 2];
			
 
				+    float im = p[i * 2 + 1];
			
 
				+    p[i] = real * real + im * im;
			
 
				+  }
			
 
				+  p[0] = first_energy;
			
 
				+  p[half_dim] = last_energy;  // Will actually never be used, and anyway
			
 
				+  // if the signal has been bandlimited sensibly this should be zero.
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/csrc/feature-functions.h
+++ b/ggml/examples/kaldi-native-fbank/csrc/feature-functions.h
@@ -0,0 +1,38 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+// This file is copied/modified from kaldi/src/feat/feature-functions.h
			
 
				+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_
			
 
				+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_
			
 
				+
			
 
				+#include <vector>
			
 
				+namespace knf {
			
 
				+
			
 
				+// ComputePowerSpectrum converts a complex FFT (as produced by the FFT
			
 
				+// functions in csrc/rfft.h), and converts it into
			
 
				+// a power spectrum.  If the complex FFT is a vector of size n (representing
			
 
				+// half of the complex FFT of a real signal of size n, as described there),
			
 
				+// this function computes in the first (n/2) + 1 elements of it, the
			
 
				+// energies of the fft bins from zero to the Nyquist frequency.  Contents of the
			
 
				+// remaining (n/2) - 1 elements are undefined at output.
			
 
				+
			
 
				+void ComputePowerSpectrum(std::vector<float> *complex_fft);
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_
			
--- a/ggml/examples/kaldi-native-fbank/csrc/feature-window.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/feature-window.cc
@@ -0,0 +1,235 @@
 
				+// kaldi-native-fbank/csrc/feature-window.cc
			
 
				+//
			
 
				+// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+
			
 
				+// This file is copied/modified from kaldi/src/feat/feature-window.cc
			
 
				+
			
 
				+#include "feature-window.h"
			
 
				+
			
 
				+#include <algorithm>
			
 
				+#include <cmath>
			
 
				+#include <limits>
			
 
				+#include <vector>
			
 
				+
			
 
				+#ifndef M_2PI
			
 
				+#define M_2PI 6.283185307179586476925286766559005
			
 
				+#endif
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
			
 
				+  os << opts.ToString();
			
 
				+  return os;
			
 
				+}
			
 
				+
			
 
				+FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
			
 
				+    : window_(opts.WindowSize()) {
			
 
				+  int32_t frame_length = opts.WindowSize();
			
 
				+  KNF_CHECK_GT(frame_length, 0);
			
 
				+
			
 
				+  float *window_data = window_.data();
			
 
				+
			
 
				+  double a = M_2PI / (frame_length - 1);
			
 
				+  for (int32_t i = 0; i < frame_length; i++) {
			
 
				+    double i_fl = static_cast<double>(i);
			
 
				+    if (opts.window_type == "hanning") {
			
 
				+      window_data[i] = 0.5 - 0.5 * cos(a * i_fl);
			
 
				+    } else if (opts.window_type == "sine") {
			
 
				+      // when you are checking ws wikipedia, please
			
 
				+      // note that 0.5 * a = M_PI/(frame_length-1)
			
 
				+      window_data[i] = sin(0.5 * a * i_fl);
			
 
				+    } else if (opts.window_type == "hamming") {
			
 
				+      window_data[i] = 0.54 - 0.46 * cos(a * i_fl);
			
 
				+    } else if (opts.window_type ==
			
 
				+               "povey") {  // like hamming but goes to zero at edges.
			
 
				+      window_data[i] = pow(0.5 - 0.5 * cos(a * i_fl), 0.85);
			
 
				+    } else if (opts.window_type == "rectangular") {
			
 
				+      window_data[i] = 1.0;
			
 
				+    } else if (opts.window_type == "blackman") {
			
 
				+      window_data[i] = opts.blackman_coeff - 0.5 * cos(a * i_fl) +
			
 
				+                       (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
			
 
				+    } else {
			
 
				+      KNF_LOG(FATAL) << "Invalid window type " << opts.window_type;
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void FeatureWindowFunction::Apply(float *wave) const {
			
 
				+  int32_t window_size = window_.size();
			
 
				+  const float *p = window_.data();
			
 
				+  for (int32_t k = 0; k != window_size; ++k) {
			
 
				+    wave[k] *= p[k];
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) {
			
 
				+  int64_t frame_shift = opts.WindowShift();
			
 
				+  if (opts.snip_edges) {
			
 
				+    return frame * frame_shift;
			
 
				+  } else {
			
 
				+    int64_t midpoint_of_frame = frame_shift * frame + frame_shift / 2,
			
 
				+            beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
			
 
				+    return beginning_of_frame;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
			
 
				+                  bool flush /*= true*/) {
			
 
				+  int64_t frame_shift = opts.WindowShift();
			
 
				+  int64_t frame_length = opts.WindowSize();
			
 
				+  if (opts.snip_edges) {
			
 
				+    // with --snip-edges=true (the default), we use a HTK-like approach to
			
 
				+    // determining the number of frames-- all frames have to fit completely into
			
 
				+    // the waveform, and the first frame begins at sample zero.
			
 
				+    if (num_samples < frame_length)
			
 
				+      return 0;
			
 
				+    else
			
 
				+      return (1 + ((num_samples - frame_length) / frame_shift));
			
 
				+    // You can understand the expression above as follows: 'num_samples -
			
 
				+    // frame_length' is how much room we have to shift the frame within the
			
 
				+    // waveform; 'frame_shift' is how much we shift it each time; and the ratio
			
 
				+    // is how many times we can shift it (integer arithmetic rounds down).
			
 
				+  } else {
			
 
				+    // if --snip-edges=false, the number of frames is determined by rounding the
			
 
				+    // (file-length / frame-shift) to the nearest integer.  The point of this
			
 
				+    // formula is to make the number of frames an obvious and predictable
			
 
				+    // function of the frame shift and signal length, which makes many
			
 
				+    // segmentation-related questions simpler.
			
 
				+    //
			
 
				+    // Because integer division in C++ rounds toward zero, we add (half the
			
 
				+    // frame-shift minus epsilon) before dividing, to have the effect of
			
 
				+    // rounding towards the closest integer.
			
 
				+    int32_t num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
			
 
				+
			
 
				+    if (flush) return num_frames;
			
 
				+
			
 
				+    // note: 'end' always means the last plus one, i.e. one past the last.
			
 
				+    int64_t end_sample_of_last_frame =
			
 
				+        FirstSampleOfFrame(num_frames - 1, opts) + frame_length;
			
 
				+
			
 
				+    // the following code is optimized more for clarity than efficiency.
			
 
				+    // If flush == false, we can't output frames that extend past the end
			
 
				+    // of the signal.
			
 
				+    while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
			
 
				+      num_frames--;
			
 
				+      end_sample_of_last_frame -= frame_shift;
			
 
				+    }
			
 
				+    return num_frames;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Extracts frame `f` of the signal into `*window` and runs ProcessWindow()
			
 
				+// on it.
			
 
				+//
			
 
				+// `wave` holds `wave_size` samples whose first element has absolute sample
			
 
				+// index `sample_offset`. `*window` is resized to opts.PaddedWindowSize(); its
			
 
				+// first opts.WindowSize() entries receive the frame's samples (reflected at
			
 
				+// the ends of `wave` when the frame reaches past them) and any remaining
			
 
				+// entries are set to zero. `log_energy_pre_window` is forwarded unchanged to
			
 
				+// ProcessWindow() and may be null.
			
 
				+void ExtractWindow(int64_t sample_offset, const float *wave, std::size_t wave_size,
			
 
				+                   int32_t f, const FrameExtractionOptions &opts,
			
 
				+                   const FeatureWindowFunction &window_function,
			
 
				+                   std::vector<float> *window,
			
 
				+                   float *log_energy_pre_window /*= nullptr*/) {
			
 
				+  KNF_CHECK(sample_offset >= 0 && wave_size != 0);
			
 
				+
			
 
				+  int32_t frame_length = opts.WindowSize();
			
 
				+  int32_t frame_length_padded = opts.PaddedWindowSize();
			
 
				+
			
 
				+  int64_t num_samples = sample_offset + static_cast<int64_t>(wave_size);
			
 
				+  int64_t start_sample = FirstSampleOfFrame(f, opts);
			
 
				+  int64_t end_sample = start_sample + frame_length;
			
 
				+
			
 
				+  if (opts.snip_edges) {
			
 
				+    KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples);
			
 
				+  } else {
			
 
				+    KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset);
			
 
				+  }
			
 
				+
			
 
				+  if (window->size() != static_cast<std::size_t>(frame_length_padded)) {
			
 
				+    window->resize(frame_length_padded);
			
 
				+  }
			
 
				+
			
 
				+  // wave_start and wave_end are start and end indexes into 'wave', for the
			
 
				+  // piece of wave that we're trying to extract.
			
 
				+  int32_t wave_start = int32_t(start_sample - sample_offset);
			
 
				+  int32_t wave_end = wave_start + frame_length;
			
 
				+
			
 
				+  if (wave_start >= 0 && static_cast<std::size_t>(wave_end) <= wave_size) {
			
 
				+    // the normal case-- no edge effects to consider.
			
 
				+    std::copy(wave + wave_start,
			
 
				+              wave + wave_start + frame_length, window->data());
			
 
				+  } else {
			
 
				+    // Deal with any end effects by reflection, if needed.  This code will only
			
 
				+    // be reached for about two frames per utterance, so we don't concern
			
 
				+    // ourselves excessively with efficiency.
			
 
				+    int32_t wave_dim = wave_size;
			
 
				+    for (int32_t s = 0; s < frame_length; ++s) {
			
 
				+      int32_t s_in_wave = s + wave_start;
			
 
				+      while (s_in_wave < 0 || s_in_wave >= wave_dim) {
			
 
				+        // reflect around the beginning or end of the wave.
			
 
				+        // e.g. -1 -> 0, -2 -> 1.
			
 
				+        // dim -> dim - 1, dim + 1 -> dim - 2.
			
 
				+        // the code supports repeated reflections, although this
			
 
				+        // would only be needed in pathological cases.
			
 
				+        if (s_in_wave < 0)
			
 
				+          s_in_wave = -s_in_wave - 1;
			
 
				+        else
			
 
				+          s_in_wave = 2 * wave_dim - 1 - s_in_wave;
			
 
				+      }
			
 
				+      (*window)[s] = wave[s_in_wave];
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Zero the padding.  resize() above only value-initializes elements it
			
 
				+  // adds, so a buffer reused from a previous frame would otherwise keep
			
 
				+  // stale values in [frame_length, frame_length_padded).
			
 
				+  if (frame_length_padded > frame_length) {
			
 
				+    std::fill(window->begin() + frame_length, window->end(), 0.0f);
			
 
				+  }
			
 
				+
			
 
				+  ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
			
 
				+}
			
 
				+
			
 
				+static void RemoveDcOffset(float *d, int32_t n) {
			
 
				+  float sum = 0;
			
 
				+  for (int32_t i = 0; i != n; ++i) {
			
 
				+    sum += d[i];
			
 
				+  }
			
 
				+
			
 
				+  float mean = sum / n;
			
 
				+
			
 
				+  for (int32_t i = 0; i != n; ++i) {
			
 
				+    d[i] -= mean;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+float InnerProduct(const float *a, const float *b, int32_t n) {
			
 
				+  float sum = 0;
			
 
				+  for (int32_t i = 0; i != n; ++i) {
			
 
				+    sum += a[i] * b[i];
			
 
				+  }
			
 
				+  return sum;
			
 
				+}
			
 
				+
			
 
				+static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
			
 
				+  if (preemph_coeff == 0.0) {
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
			
 
				+
			
 
				+  for (int32_t i = n - 1; i > 0; --i) {
			
 
				+    d[i] -= preemph_coeff * d[i - 1];
			
 
				+  }
			
 
				+  d[0] -= preemph_coeff * d[0];
			
 
				+}
			
 
				+
			
 
				+void ProcessWindow(const FrameExtractionOptions &opts,
			
 
				+                   const FeatureWindowFunction &window_function, float *window,
			
 
				+                   float *log_energy_pre_window /*= nullptr*/) {
			
 
				+  int32_t frame_length = opts.WindowSize();
			
 
				+
			
 
				+  if (opts.remove_dc_offset) {
			
 
				+    RemoveDcOffset(window, frame_length);
			
 
				+  }
			
 
				+
			
 
				+  if (log_energy_pre_window != NULL) {
			
 
				+    float energy = std::max<float>(InnerProduct(window, window, frame_length),
			
 
				+                                   std::numeric_limits<float>::epsilon());
			
 
				+    *log_energy_pre_window = std::log(energy);
			
 
				+  }
			
 
				+
			
 
				+  if (opts.preemph_coeff != 0.0) {
			
 
				+    Preemphasize(window, frame_length, opts.preemph_coeff);
			
 
				+  }
			
 
				+
			
 
				+  window_function.Apply(window);
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/csrc/feature-window.h
+++ b/ggml/examples/kaldi-native-fbank/csrc/feature-window.h
@@ -0,0 +1,172 @@
 
				+// kaldi-native-fbank/csrc/feature-window.h
			
 
				+//
			
 
				+// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+
			
 
				+// This file is copied/modified from kaldi/src/feat/feature-window.h
			
 
				+
			
 
				+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_
			
 
				+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_
			
 
				+
			
 
				+#include <sstream>
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "log.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Returns the smallest power of two that is >= n, for positive n.
			
 
				+// Inputs above 2^30 would overflow int32_t in the final "+ 1".
			
 
				+// The bit-smearing technique is copied from kaldi/src/base/kaldi-math.cc.
			
 
				+inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) {
			
 
				+  KNF_CHECK_GT(n, 0);
			
 
				+  int32_t bits = n - 1;
			
 
				+  // Copy the highest set bit into every lower position (shifts 1,2,4,8,16).
			
 
				+  for (int32_t shift = 1; shift <= 16; shift *= 2) {
			
 
				+    bits |= bits >> shift;
			
 
				+  }
			
 
				+  return bits + 1;
			
 
				+}
			
 
				+
			
 
				+// Options that describe how a waveform is cut into frames and how each
			
 
				+// frame is conditioned before feature computation.
			
 
				+struct FrameExtractionOptions {
			
 
				+  float samp_freq = 16000;        // Samples per second.
			
 
				+  float frame_shift_ms = 10.0f;   // Hop between frames, in milliseconds.
			
 
				+  float frame_length_ms = 25.0f;  // Frame duration, in milliseconds.
			
 
				+  float dither = 1.0f;            // Dithering amount; 0.0 disables it.
			
 
				+  float preemph_coeff = 0.97f;    // Pre-emphasis coefficient; 0 disables it.
			
 
				+  bool remove_dc_offset = true;   // Subtract the frame mean before the FFT.
			
 
				+  // Name of the window function. One of "hamming", "rectangular", "povey",
			
 
				+  // "hanning", "sine", "blackman". The default "povey" is similar to Hamming
			
 
				+  // but goes to zero at the edges:
			
 
				+  //   pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85)
			
 
				+  std::string window_type = "povey";
			
 
				+  // When true, PaddedWindowSize() rounds the frame size up to a power of two.
			
 
				+  bool round_to_power_of_two = true;
			
 
				+  // NOTE(review): presumably only used by the "blackman" window - confirm
			
 
				+  // in feature-window.cc.
			
 
				+  float blackman_coeff = 0.42f;
			
 
				+  bool snip_edges = true;
			
 
				+  // bool allow_downsample = false;
			
 
				+  // bool allow_upsample = false;
			
 
				+
			
 
				+  // Frame hop in samples (truncated toward zero).
			
 
				+  int32_t WindowShift() const {
			
 
				+    return static_cast<int32_t>(samp_freq * 0.001f * frame_shift_ms);
			
 
				+  }
			
 
				+
			
 
				+  // Frame length in samples (truncated toward zero).
			
 
				+  int32_t WindowSize() const {
			
 
				+    return static_cast<int32_t>(samp_freq * 0.001f * frame_length_ms);
			
 
				+  }
			
 
				+
			
 
				+  // Frame length in samples, rounded up to a power of two when
			
 
				+  // round_to_power_of_two is set.
			
 
				+  int32_t PaddedWindowSize() const {
			
 
				+    if (!round_to_power_of_two) {
			
 
				+      return WindowSize();
			
 
				+    }
			
 
				+    return RoundUpToNearestPowerOfTwo(WindowSize());
			
 
				+  }
			
 
				+
			
 
				+  // Renders every option as one "name: value" line.
			
 
				+  std::string ToString() const {
			
 
				+    std::ostringstream os;
			
 
				+    os << "samp_freq: " << samp_freq << "\n";
			
 
				+    os << "frame_shift_ms: " << frame_shift_ms << "\n";
			
 
				+    os << "frame_length_ms: " << frame_length_ms << "\n";
			
 
				+    os << "dither: " << dither << "\n";
			
 
				+    os << "preemph_coeff: " << preemph_coeff << "\n";
			
 
				+    os << "remove_dc_offset: " << remove_dc_offset << "\n";
			
 
				+    os << "window_type: " << window_type << "\n";
			
 
				+    os << "round_to_power_of_two: " << round_to_power_of_two << "\n";
			
 
				+    os << "blackman_coeff: " << blackman_coeff << "\n";
			
 
				+    os << "snip_edges: " << snip_edges << "\n";
			
 
				+    return os.str();
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts);
			
 
				+
			
 
				+// Holds the per-sample window coefficients and applies them to a frame.
			
 
				+// The coefficients are built from a FrameExtractionOptions by the explicit
			
 
				+// constructor (defined outside this header).
			
 
				+class FeatureWindowFunction {
			
 
				+ public:
			
 
				+  FeatureWindowFunction() = default;
			
 
				+  explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
			
 
				+  /**
			
 
				+   * @param wave Pointer to a 1-D array of shape [window_size].
			
 
				+   *             It is modified in-place: wave[i] = wave[i] * window_[i].
			
 
				+   */
			
 
				+  void Apply(float *wave) const;
			
 
				+
			
 
				+ private:
			
 
				+  std::vector<float> window_;  // of size opts.WindowSize()
			
 
				+};
			
 
				+
			
 
				+int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts);
			
 
				+
			
 
				+/**
			
 
				+   This function returns the number of frames that we can extract from a wave
			
 
				+   file with the given number of samples in it (assumed to have the same
			
 
				+   sampling rate as specified in 'opts').
			
 
				+
			
 
				+      @param [in] num_samples  The number of samples in the wave file.
			
 
				+      @param [in] opts     The frame-extraction options class
			
 
				+
			
 
				+      @param [in] flush   True if we are asserting that this number of samples
			
 
				+   is 'all there is', false if we are expecting more data to possibly come in.  This
			
 
				+   only makes a difference to the answer
			
 
				+   if opts.snip_edges == false.  For offline feature extraction you always want
			
 
				+   flush == true.  In an online-decoding context, once you know (or decide) that
			
 
				+   no more data is coming in, you'd call it with flush == true at the end to
			
 
				+   flush out any remaining data.
			
 
				+*/
			
 
				+int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
			
 
				+                  bool flush = true);
			
 
				+
			
 
				+/*
			
 
				+  ExtractWindow() extracts a windowed frame of waveform (possibly with a
			
 
				+  power-of-two, padded size, depending on the config), including all the
			
 
				+  processing done by ProcessWindow().
			
 
				+
			
 
				+  @param [in] sample_offset  If 'wave' is not the entire waveform, but
			
 
				+                   part of it to the left has been discarded, then the
			
 
				+                   number of samples prior to 'wave' that we have
			
 
				+                   already discarded.  Set this to zero if you are
			
 
				+                   processing the entire waveform in one piece, or
			
 
				+                   if you get 'no matching function' compilation
			
 
				+                   errors when updating the code.
			
 
				+  @param [in] wave  The waveform
			
 
				+  @param [in] f     The frame index to be extracted, with
			
 
				+                    0 <= f < NumFrames(sample_offset + wave_size, opts, true)
			
 
				+  @param [in] opts  The options class to be used
			
 
				+  @param [in] window_function  The windowing function, as derived from the
			
 
				+                    options class.
			
 
				+  @param [out] window  The windowed, possibly-padded waveform to be
			
 
				+                     extracted.  Will be resized as needed.
			
 
				+  @param [out] log_energy_pre_window  If non-NULL, the log-energy of
			
 
				+                   the signal prior to pre-emphasis and multiplying by
			
 
				+                   the windowing function will be written to here.
			
 
				+*/
			
 
				+void ExtractWindow(int64_t sample_offset, const float *wave, std::size_t wave_size,
			
 
				+                   int32_t f, const FrameExtractionOptions &opts,
			
 
				+                   const FeatureWindowFunction &window_function,
			
 
				+                   std::vector<float> *window,
			
 
				+                   float *log_energy_pre_window = nullptr);
			
 
				+
			
 
				+/**
			
 
				+  This function does all the windowing steps after actually
			
 
				+  extracting the windowed signal: depending on the
			
 
				+  configuration, it does dc offset removal (but no dithering),
			
 
				+  preemphasis, and multiplication by the windowing function.
			
 
				+   @param [in] opts  The options class to be used
			
 
				+   @param [in] window_function  The windowing function-- should have
			
 
				+                    been initialized using 'opts'.
			
 
				+   @param [in,out] window  A vector of size opts.WindowSize().  Note:
			
 
				+      it will typically be a sub-vector of a larger vector of size
			
 
				+      opts.PaddedWindowSize(), with the remaining samples zero,
			
 
				+      as the FFT code is more efficient if it operates on data with
			
 
				+      power-of-two size.
			
 
				+   @param [out]   log_energy_pre_window If non-NULL, then after the optional
			
 
				+      DC offset removal, this function will write to this pointer the log of
			
 
				+      the total energy (i.e. sum-squared) of the frame.
			
 
				+ */
			
 
				+void ProcessWindow(const FrameExtractionOptions &opts,
			
 
				+                   const FeatureWindowFunction &window_function, float *window,
			
 
				+                   float *log_energy_pre_window = nullptr);
			
 
				+
			
 
				+// Compute the inner product of two vectors
			
 
				+float InnerProduct(const float *a, const float *b, int32_t n);
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_
			
--- a/ggml/examples/kaldi-native-fbank/csrc/fftsg.c
+++ b/ggml/examples/kaldi-native-fbank/csrc/fftsg.c
@@ -0,0 +1,2975 @@
 
				+/* This file is copied from
			
 
				+ *
			
 
				+ * https://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
			
 
				+ *
			
 
				+ * Copyright Takuya OOURA, 1996-2001
			
 
				+ *
			
 
				+ * You may use, copy, modify and distribute this code for any
			
 
				+ * purpose (include commercial use) and without fee. Please refer to
			
 
				+ * this package when you modify this code.
			
 
				+ */
			
 
				+/*
			
 
				+Fast Fourier/Cosine/Sine Transform
			
 
				+    dimension   :one
			
 
				+    data length :power of 2
			
 
				+    decimation  :frequency
			
 
				+    radix       :split-radix
			
 
				+    data        :inplace
			
 
				+    table       :use
			
 
				+functions
			
 
				+    cdft: Complex Discrete Fourier Transform
			
 
				+    rdft: Real Discrete Fourier Transform
			
 
				+    ddct: Discrete Cosine Transform
			
 
				+    ddst: Discrete Sine Transform
			
 
				+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
			
 
				+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
			
 
				+function prototypes
			
 
				+    void cdft(int, int, double *, int *, double *);
			
 
				+    void rdft(int, int, double *, int *, double *);
			
 
				+    void ddct(int, int, double *, int *, double *);
			
 
				+    void ddst(int, int, double *, int *, double *);
			
 
				+    void dfct(int, double *, double *, int *, double *);
			
 
				+    void dfst(int, double *, double *, int *, double *);
			
 
				+macro definitions
			
 
				+    USE_CDFT_PTHREADS : default=not defined
			
 
				+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=8192
			
 
				+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536
			
 
				+    USE_CDFT_WINTHREADS : default=not defined
			
 
				+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=32768
			
 
				+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288
			
 
				+
			
 
				+
			
 
				+-------- Complex DFT (Discrete Fourier Transform) --------
			
 
				+    [definition]
			
 
				+        <case1>
			
 
				+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
			
 
				+        <case2>
			
 
				+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
			
 
				+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
			
 
				+    [usage]
			
 
				+        <case1>
			
 
				+            ip[0] = 0; // first time only
			
 
				+            cdft(2*n, 1, a, ip, w);
			
 
				+        <case2>
			
 
				+            ip[0] = 0; // first time only
			
 
				+            cdft(2*n, -1, a, ip, w);
			
 
				+    [parameters]
			
 
				+        2*n            :data length (int)
			
 
				+                        n >= 1, n = power of 2
			
 
				+        a[0...2*n-1]   :input/output data (double *)
			
 
				+                        input data
			
 
				+                            a[2*j] = Re(x[j]),
			
 
				+                            a[2*j+1] = Im(x[j]), 0<=j<n
			
 
				+                        output data
			
 
				+                            a[2*k] = Re(X[k]),
			
 
				+                            a[2*k+1] = Im(X[k]), 0<=k<n
			
 
				+        ip[0...*]      :work area for bit reversal (int *)
			
 
				+                        length of ip >= 2+sqrt(n)
			
 
				+                        strictly,
			
 
				+                        length of ip >=
			
 
				+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
			
 
				+                        ip[0],ip[1] are pointers of the cos/sin table.
			
 
				+        w[0...n/2-1]   :cos/sin table (double *)
			
 
				+                        w[],ip[] are initialized if ip[0] == 0.
			
 
				+    [remark]
			
 
				+        Inverse of
			
 
				+            cdft(2*n, -1, a, ip, w);
			
 
				+        is
			
 
				+            cdft(2*n, 1, a, ip, w);
			
 
				+            for (j = 0; j <= 2 * n - 1; j++) {
			
 
				+                a[j] *= 1.0 / n;
			
 
				+            }
			
 
				+        .
			
 
				+
			
 
				+
			
 
				+-------- Real DFT / Inverse of Real DFT --------
			
 
				+    [definition]
			
 
				+        <case1> RDFT
			
 
				+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
			
 
				+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
			
 
				+        <case2> IRDFT (excluding scale)
			
 
				+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +
			
 
				+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +
			
 
				+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
			
 
				+    [usage]
			
 
				+        <case1>
			
 
				+            ip[0] = 0; // first time only
			
 
				+            rdft(n, 1, a, ip, w);
			
 
				+        <case2>
			
 
				+            ip[0] = 0; // first time only
			
 
				+            rdft(n, -1, a, ip, w);
			
 
				+    [parameters]
			
 
				+        n              :data length (int)
			
 
				+                        n >= 2, n = power of 2
			
 
				+        a[0...n-1]     :input/output data (double *)
			
 
				+                        <case1>
			
 
				+                            output data
			
 
				+                                a[2*k] = R[k], 0<=k<n/2
			
 
				+                                a[2*k+1] = I[k], 0<k<n/2
			
 
				+                                a[1] = R[n/2]
			
 
				+                        <case2>
			
 
				+                            input data
			
 
				+                                a[2*j] = R[j], 0<=j<n/2
			
 
				+                                a[2*j+1] = I[j], 0<j<n/2
			
 
				+                                a[1] = R[n/2]
			
 
				+        ip[0...*]      :work area for bit reversal (int *)
			
 
				+                        length of ip >= 2+sqrt(n/2)
			
 
				+                        strictly,
			
 
				+                        length of ip >=
			
 
				+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
			
 
				+                        ip[0],ip[1] are pointers of the cos/sin table.
			
 
				+        w[0...n/2-1]   :cos/sin table (double *)
			
 
				+                        w[],ip[] are initialized if ip[0] == 0.
			
 
				+    [remark]
			
 
				+        Inverse of
			
 
				+            rdft(n, 1, a, ip, w);
			
 
				+        is
			
 
				+            rdft(n, -1, a, ip, w);
			
 
				+            for (j = 0; j <= n - 1; j++) {
			
 
				+                a[j] *= 2.0 / n;
			
 
				+            }
			
 
				+        .
			
 
				+
			
 
				+
			
 
				+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
			
 
				+    [definition]
			
 
				+        <case1> IDCT (excluding scale)
			
 
				+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
			
 
				+        <case2> DCT
			
 
				+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
			
 
				+    [usage]
			
 
				+        <case1>
			
 
				+            ip[0] = 0; // first time only
			
 
				+            ddct(n, 1, a, ip, w);
			
 
				+        <case2>
			
 
				+            ip[0] = 0; // first time only
			
 
				+            ddct(n, -1, a, ip, w);
			
 
				+    [parameters]
			
 
				+        n              :data length (int)
			
 
				+                        n >= 2, n = power of 2
			
 
				+        a[0...n-1]     :input/output data (double *)
			
 
				+                        output data
			
 
				+                            a[k] = C[k], 0<=k<n
			
 
				+        ip[0...*]      :work area for bit reversal (int *)
			
 
				+                        length of ip >= 2+sqrt(n/2)
			
 
				+                        strictly,
			
 
				+                        length of ip >=
			
 
				+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
			
 
				+                        ip[0],ip[1] are pointers of the cos/sin table.
			
 
				+        w[0...n*5/4-1] :cos/sin table (double *)
			
 
				+                        w[],ip[] are initialized if ip[0] == 0.
			
 
				+    [remark]
			
 
				+        Inverse of
			
 
				+            ddct(n, -1, a, ip, w);
			
 
				+        is
			
 
				+            a[0] *= 0.5;
			
 
				+            ddct(n, 1, a, ip, w);
			
 
				+            for (j = 0; j <= n - 1; j++) {
			
 
				+                a[j] *= 2.0 / n;
			
 
				+            }
			
 
				+        .
			
 
				+
			
 
				+
			
 
				+-------- DST (Discrete Sine Transform) / Inverse of DST --------
			
 
				+    [definition]
			
 
				+        <case1> IDST (excluding scale)
			
 
				+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
			
 
				+        <case2> DST
			
 
				+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
			
 
				+    [usage]
			
 
				+        <case1>
			
 
				+            ip[0] = 0; // first time only
			
 
				+            ddst(n, 1, a, ip, w);
			
 
				+        <case2>
			
 
				+            ip[0] = 0; // first time only
			
 
				+            ddst(n, -1, a, ip, w);
			
 
				+    [parameters]
			
 
				+        n              :data length (int)
			
 
				+                        n >= 2, n = power of 2
			
 
				+        a[0...n-1]     :input/output data (double *)
			
 
				+                        <case1>
			
 
				+                            input data
			
 
				+                                a[j] = A[j], 0<j<n
			
 
				+                                a[0] = A[n]
			
 
				+                            output data
			
 
				+                                a[k] = S[k], 0<=k<n
			
 
				+                        <case2>
			
 
				+                            output data
			
 
				+                                a[k] = S[k], 0<k<n
			
 
				+                                a[0] = S[n]
			
 
				+        ip[0...*]      :work area for bit reversal (int *)
			
 
				+                        length of ip >= 2+sqrt(n/2)
			
 
				+                        strictly,
			
 
				+                        length of ip >=
			
 
				+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
			
 
				+                        ip[0],ip[1] are pointers of the cos/sin table.
			
 
				+        w[0...n*5/4-1] :cos/sin table (double *)
			
 
				+                        w[],ip[] are initialized if ip[0] == 0.
			
 
				+    [remark]
			
 
				+        Inverse of
			
 
				+            ddst(n, -1, a, ip, w);
			
 
				+        is
			
 
				+            a[0] *= 0.5;
			
 
				+            ddst(n, 1, a, ip, w);
			
 
				+            for (j = 0; j <= n - 1; j++) {
			
 
				+                a[j] *= 2.0 / n;
			
 
				+            }
			
 
				+        .
			
 
				+
			
 
				+
			
 
				+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
			
 
				+    [definition]
			
 
				+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
			
 
				+    [usage]
			
 
				+        ip[0] = 0; // first time only
			
 
				+        dfct(n, a, t, ip, w);
			
 
				+    [parameters]
			
 
				+        n              :data length - 1 (int)
			
 
				+                        n >= 2, n = power of 2
			
 
				+        a[0...n]       :input/output data (double *)
			
 
				+                        output data
			
 
				+                            a[k] = C[k], 0<=k<=n
			
 
				+        t[0...n/2]     :work area (double *)
			
 
				+        ip[0...*]      :work area for bit reversal (int *)
			
 
				+                        length of ip >= 2+sqrt(n/4)
			
 
				+                        strictly,
			
 
				+                        length of ip >=
			
 
				+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
			
 
				+                        ip[0],ip[1] are pointers of the cos/sin table.
			
 
				+        w[0...n*5/8-1] :cos/sin table (double *)
			
 
				+                        w[],ip[] are initialized if ip[0] == 0.
			
 
				+    [remark]
			
 
				+        Inverse of
			
 
				+            a[0] *= 0.5;
			
 
				+            a[n] *= 0.5;
			
 
				+            dfct(n, a, t, ip, w);
			
 
				+        is
			
 
				+            a[0] *= 0.5;
			
 
				+            a[n] *= 0.5;
			
 
				+            dfct(n, a, t, ip, w);
			
 
				+            for (j = 0; j <= n; j++) {
			
 
				+                a[j] *= 2.0 / n;
			
 
				+            }
			
 
				+        .
			
 
				+
			
 
				+
			
 
				+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
			
 
				+    [definition]
			
 
				+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
			
 
				+    [usage]
			
 
				+        ip[0] = 0; // first time only
			
 
				+        dfst(n, a, t, ip, w);
			
 
				+    [parameters]
			
 
				+        n              :data length + 1 (int)
			
 
				+                        n >= 2, n = power of 2
			
 
				+        a[0...n-1]     :input/output data (double *)
			
 
				+                        output data
			
 
				+                            a[k] = S[k], 0<k<n
			
 
				+                        (a[0] is used for work area)
			
 
				+        t[0...n/2-1]   :work area (double *)
			
 
				+        ip[0...*]      :work area for bit reversal (int *)
			
 
				+                        length of ip >= 2+sqrt(n/4)
			
 
				+                        strictly,
			
 
				+                        length of ip >=
			
 
				+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
			
 
				+                        ip[0],ip[1] are pointers of the cos/sin table.
			
 
				+        w[0...n*5/8-1] :cos/sin table (double *)
			
 
				+                        w[],ip[] are initialized if ip[0] == 0.
			
 
				+    [remark]
			
 
				+        Inverse of
			
 
				+            dfst(n, a, t, ip, w);
			
 
				+        is
			
 
				+            dfst(n, a, t, ip, w);
			
 
				+            for (j = 1; j <= n - 1; j++) {
			
 
				+                a[j] *= 2.0 / n;
			
 
				+            }
			
 
				+        .
			
 
				+
			
 
				+
			
 
				+Appendix :
			
 
				+    The cos/sin table is recalculated when a larger table is required.
			
 
				+    w[] and ip[] are compatible with all routines.
			
 
				+*/
			
 
				+
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * Real DFT (isgn >= 0) or inverse real DFT without scaling (isgn < 0),
			
 
				+ * computed in place on a[0...n-1]. See the file header for the packed
			
 
				+ * layout of a[] (a[1] carries R[n/2]) and for the required sizes of
			
 
				+ * ip[] and w[].
			
 
				+ *
			
 
				+ * ip[0] and ip[1] cache the sizes of the two tables stored in w[]; a
			
 
				+ * table is rebuilt whenever n needs more entries than are cached, which
			
 
				+ * is why callers set ip[0] = 0 before the first call.
			
 
				+ */
			
 
				+void rdft(int n, int isgn, double *a, int *ip, double *w)
			
 
				+{
			
 
				+    void makewt(int nw, int *ip, double *w);
			
 
				+    void makect(int nc, int *ip, double *c);
			
 
				+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
			
 
				+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
			
 
				+    void rftfsub(int n, double *a, int nc, double *c);
			
 
				+    void rftbsub(int n, double *a, int nc, double *c);
			
 
				+    int nw, nc;
			
 
				+    double xi;
			
 
				+
			
 
				+    /* (re)build the table at w[0...] when fewer than n/4 entries are cached */
			
 
				+    nw = ip[0];
			
 
				+    if (n > (nw << 2)) {
			
 
				+        nw = n >> 2;
			
 
				+        makewt(nw, ip, w);
			
 
				+    }
			
 
				+    /* (re)build the second table, stored right after the first at w + nw */
			
 
				+    nc = ip[1];
			
 
				+    if (n > (nc << 2)) {
			
 
				+        nc = n >> 2;
			
 
				+        makect(nc, ip, w + nw);
			
 
				+    }
			
 
				+    if (isgn >= 0) {
			
 
				+        /* forward: complex sub-transform first, then the real post-pass */
			
 
				+        if (n > 4) {
			
 
				+            cftfsub(n, a, ip, nw, w);
			
 
				+            rftfsub(n, a, nc, w + nw);
			
 
				+        } else if (n == 4) {
			
 
				+            cftfsub(n, a, ip, nw, w);
			
 
				+        }
			
 
				+        /* a[0] <- a[0] + a[1], a[1] <- a[0] - a[1] (old values) */
			
 
				+        xi = a[0] - a[1];
			
 
				+        a[0] += a[1];
			
 
				+        a[1] = xi;
			
 
				+    } else {
			
 
				+        /* inverse: undo the a[0]/a[1] packing, then run the passes in
			
 
				+           the opposite order (real pre-pass, then complex sub-transform) */
			
 
				+        a[1] = 0.5 * (a[0] - a[1]);
			
 
				+        a[0] -= a[1];
			
 
				+        if (n > 4) {
			
 
				+            rftbsub(n, a, nc, w + nw);
			
 
				+            cftbsub(n, a, ip, nw, w);
			
 
				+        } else if (n == 4) {
			
 
				+            cftbsub(n, a, ip, nw, w);
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* -------- initializing routines -------- */
			
 
				+
			
 
				+
			
 
				+#include <math.h>
			
 
				+
			
 
				+/*
			
 
				+ * Fills w[] with the cos/sin table for table size nw and records the
			
 
				+ * size in ip[0]. ip[1] is reset to 1. For nw <= 2 only ip[0] and ip[1]
			
 
				+ * are written.
			
 
				+ */
			
 
				+void makewt(int nw, int *ip, double *w)
			
 
				+{
			
 
				+    void makeipt(int nw, int *ip);
			
 
				+    int j, nwh, nw0, nw1;
			
 
				+    double delta, wn4r, wk1r, wk1i, wk3r, wk3i;
			
 
				+
			
 
				+    ip[0] = nw;
			
 
				+    ip[1] = 1;
			
 
				+    if (nw > 2) {
			
 
				+        nwh = nw >> 1;
			
 
				+        /* atan(1.0) is pi/4, so delta * nwh == pi/4 and wn4r == cos(pi/4) */
			
 
				+        delta = atan(1.0) / nwh;
			
 
				+        wn4r = cos(delta * nwh);
			
 
				+        w[0] = 1;
			
 
				+        w[1] = wn4r;
			
 
				+        if (nwh == 4) {
			
 
				+            w[2] = cos(delta * 2);
			
 
				+            w[3] = sin(delta * 2);
			
 
				+        } else if (nwh > 4) {
			
 
				+            /* the index table in ip[2...] is only built on this branch */
			
 
				+            makeipt(nw, ip);
			
 
				+            w[2] = 0.5 / cos(delta * 2);
			
 
				+            w[3] = 0.5 / cos(delta * 6);
			
 
				+            /* four entries per step: cos, sin of delta*j and of 3*delta*j */
			
 
				+            for (j = 4; j < nwh; j += 4) {
			
 
				+                w[j] = cos(delta * j);
			
 
				+                w[j + 1] = sin(delta * j);
			
 
				+                w[j + 2] = cos(3 * delta * j);
			
 
				+                w[j + 3] = -sin(3 * delta * j);
			
 
				+            }
			
 
				+        }
			
 
				+        /* append successively halved tables after the first one; each
			
 
				+           level (starting at nw1) is filled from entries already
			
 
				+           stored in the previous level (starting at nw0) */
			
 
				+        nw0 = 0;
			
 
				+        while (nwh > 2) {
			
 
				+            nw1 = nw0 + nwh;
			
 
				+            nwh >>= 1;
			
 
				+            w[nw1] = 1;
			
 
				+            w[nw1 + 1] = wn4r;
			
 
				+            if (nwh == 4) {
			
 
				+                wk1r = w[nw0 + 4];
			
 
				+                wk1i = w[nw0 + 5];
			
 
				+                w[nw1 + 2] = wk1r;
			
 
				+                w[nw1 + 3] = wk1i;
			
 
				+            } else if (nwh > 4) {
			
 
				+                wk1r = w[nw0 + 4];
			
 
				+                wk3r = w[nw0 + 6];
			
 
				+                w[nw1 + 2] = 0.5 / wk1r;
			
 
				+                w[nw1 + 3] = 0.5 / wk3r;
			
 
				+                /* copy every second group of four from the previous level */
			
 
				+                for (j = 4; j < nwh; j += 4) {
			
 
				+                    wk1r = w[nw0 + 2 * j];
			
 
				+                    wk1i = w[nw0 + 2 * j + 1];
			
 
				+                    wk3r = w[nw0 + 2 * j + 2];
			
 
				+                    wk3i = w[nw0 + 2 * j + 3];
			
 
				+                    w[nw1 + j] = wk1r;
			
 
				+                    w[nw1 + j + 1] = wk1i;
			
 
				+                    w[nw1 + j + 2] = wk3r;
			
 
				+                    w[nw1 + j + 3] = wk3i;
			
 
				+                }
			
 
				+            }
			
 
				+            nw0 = nw1;
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * Fills ip[2...] with an integer index table for table size nw (the
			
 
				+ * file header describes ip[] as the work area for bit reversal).
			
 
				+ * ip[0] and ip[1] are not touched here.
			
 
				+ */
			
 
				+void makeipt(int nw, int *ip)
			
 
				+{
			
 
				+    int j, l, m, m2, p, q;
			
 
				+
			
 
				+    ip[2] = 0;
			
 
				+    ip[3] = 16;
			
 
				+    m = 2;
			
 
				+    /* each pass reads ip[m...2m-1] and writes two entries per value read,
			
 
				+       into ip[2m...4m-1]; m doubles while l is divided by four */
			
 
				+    for (l = nw; l > 32; l >>= 2) {
			
 
				+        m2 = m << 1;
			
 
				+        q = m2 << 3;
			
 
				+        for (j = m; j < m2; j++) {
			
 
				+            p = ip[j] << 2;
			
 
				+            ip[m + j] = p;
			
 
				+            ip[m2 + j] = p + q;
			
 
				+        }
			
 
				+        m = m2;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * Fills c[0...nc-1] with a table of cosine/sine values and records the
			
 
				+ * table size in ip[1]. For nc <= 1 only ip[1] is written.
			
 
				+ */
			
 
				+void makect(int nc, int *ip, double *c)
			
 
				+{
			
 
				+    int j, nch;
			
 
				+    double delta;
			
 
				+
			
 
				+    ip[1] = nc;
			
 
				+    if (nc > 1) {
			
 
				+        nch = nc >> 1;
			
 
				+        /* atan(1.0) is pi/4, so delta * nch == pi/4 */
			
 
				+        delta = atan(1.0) / nch;
			
 
				+        c[0] = cos(delta * nch);
			
 
				+        c[nch] = 0.5 * c[0];
			
 
				+        /* cosines fill the table from the front, sines from the back */
			
 
				+        for (j = 1; j < nch; j++) {
			
 
				+            c[j] = 0.5 * cos(delta * j);
			
 
				+            c[nc - j] = 0.5 * sin(delta * j);
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* -------- child routines -------- */
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * POSIX-threads backend, enabled by defining USE_CDFT_PTHREADS.
			
 
				+ * Defines USE_CDFT_THREADS, default size thresholds (overridable by
			
 
				+ * defining them beforehand) and the cdft_thread_* wrappers. Both
			
 
				+ * wrappers print "cdft thread error" to stderr and call exit(1) when
			
 
				+ * the underlying pthread call fails.
			
 
				+ */
			
 
				+#ifdef USE_CDFT_PTHREADS
			
 
				+#define USE_CDFT_THREADS
			
 
				+#ifndef CDFT_THREADS_BEGIN_N
			
 
				+#define CDFT_THREADS_BEGIN_N 8192
			
 
				+#endif
			
 
				+#ifndef CDFT_4THREADS_BEGIN_N
			
 
				+#define CDFT_4THREADS_BEGIN_N 65536
			
 
				+#endif
			
 
				+#include <pthread.h>
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#define cdft_thread_t pthread_t
			
 
				+#define cdft_thread_create(thp,func,argp) { \
			
 
				+    if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \
			
 
				+        fprintf(stderr, "cdft thread error\n"); \
			
 
				+        exit(1); \
			
 
				+    } \
			
 
				+}
			
 
				+#define cdft_thread_wait(th) { \
			
 
				+    if (pthread_join(th, NULL) != 0) { \
			
 
				+        fprintf(stderr, "cdft thread error\n"); \
			
 
				+        exit(1); \
			
 
				+    } \
			
 
				+}
			
 
				+#endif /* USE_CDFT_PTHREADS */
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * Win32-threads backend, enabled by defining USE_CDFT_WINTHREADS.
			
 
				+ * Defines USE_CDFT_THREADS, default size thresholds (overridable by
			
 
				+ * defining them beforehand) and the cdft_thread_* wrappers.
			
 
				+ * cdft_thread_create prints "cdft thread error" to stderr and calls
			
 
				+ * exit(1) when CreateThread returns 0; cdft_thread_wait waits on the
			
 
				+ * handle and closes it without checking for errors.
			
 
				+ */
			
 
				+#ifdef USE_CDFT_WINTHREADS
			
 
				+#define USE_CDFT_THREADS
			
 
				+#ifndef CDFT_THREADS_BEGIN_N
			
 
				+#define CDFT_THREADS_BEGIN_N 32768
			
 
				+#endif
			
 
				+#ifndef CDFT_4THREADS_BEGIN_N
			
 
				+#define CDFT_4THREADS_BEGIN_N 524288
			
 
				+#endif
			
 
				+#include <windows.h>
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#define cdft_thread_t HANDLE
			
 
				+#define cdft_thread_create(thp,func,argp) { \
			
 
				+    DWORD thid; \
			
 
				+    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \
			
 
				+    if (*(thp) == 0) { \
			
 
				+        fprintf(stderr, "cdft thread error\n"); \
			
 
				+        exit(1); \
			
 
				+    } \
			
 
				+}
			
 
				+#define cdft_thread_wait(th) { \
			
 
				+    WaitForSingleObject(th, INFINITE); \
			
 
				+    CloseHandle(th); \
			
 
				+}
			
 
				+#endif /* USE_CDFT_WINTHREADS */
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * Forward complex sub-transform on a[0...n-1], called from rdft() when
			
 
				+ * isgn >= 0. Picks a kernel by size: general path for n > 32, fixed-size
			
 
				+ * kernels for n == 32, 8 < n < 32, n == 8 and n == 4. Nothing is done
			
 
				+ * for other values of n.
			
 
				+ */
			
 
				+void cftfsub(int n, double *a, int *ip, int nw, double *w)
			
 
				+{
			
 
				+    void bitrv2(int n, int *ip, double *a);
			
 
				+    void bitrv216(double *a);
			
 
				+    void bitrv208(double *a);
			
 
				+    void cftf1st(int n, double *a, double *w);
			
 
				+    void cftrec4(int n, double *a, int nw, double *w);
			
 
				+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
			
 
				+    void cftfx41(int n, double *a, int nw, double *w);
			
 
				+    void cftf161(double *a, double *w);
			
 
				+    void cftf081(double *a, double *w);
			
 
				+    void cftf040(double *a);
			
 
				+    void cftx020(double *a);
			
 
				+#ifdef USE_CDFT_THREADS
			
 
				+    void cftrec4_th(int n, double *a, int nw, double *w);
			
 
				+#endif /* USE_CDFT_THREADS */
			
 
				+
			
 
				+    if (n > 8) {
			
 
				+        if (n > 32) {
			
 
				+            /* first stage reads the table starting at offset nw - n/4 */
			
 
				+            cftf1st(n, a, &w[nw - (n >> 2)]);
			
 
				+            /* with threads enabled, the trailing "else" below binds to
			
 
				+               the "if (n > 512)" chain that follows the #endif */
			
 
				+#ifdef USE_CDFT_THREADS
			
 
				+            if (n > CDFT_THREADS_BEGIN_N) {
			
 
				+                cftrec4_th(n, a, nw, w);
			
 
				+            } else
			
 
				+#endif /* USE_CDFT_THREADS */
			
 
				+            if (n > 512) {
			
 
				+                cftrec4(n, a, nw, w);
			
 
				+            } else if (n > 128) {
			
 
				+                cftleaf(n, 1, a, nw, w);
			
 
				+            } else {
			
 
				+                cftfx41(n, a, nw, w);
			
 
				+            }
			
 
				+            bitrv2(n, ip, a);
			
 
				+        } else if (n == 32) {
			
 
				+            cftf161(a, &w[nw - 8]);
			
 
				+            bitrv216(a);
			
 
				+        } else {
			
 
				+            cftf081(a, w);
			
 
				+            bitrv208(a);
			
 
				+        }
			
 
				+    } else if (n == 8) {
			
 
				+        cftf040(a);
			
 
				+    } else if (n == 4) {
			
 
				+        cftx020(a);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				/*
				 * Size dispatcher, counterpart of cftfsub(): selects the routine(s)
				 * to run on `a` according to `n`, then applies the "conj"/"neg"
				 * flavour of the reordering step.
				 * (Presumably the backward complex-FFT driver -- confirm with callers.)
				 *
				 *   n   selects the branch; values other than 4, 8 or > 8 do nothing
				 *   a   data array, modified in place by the callees
				 *   ip  index table, forwarded to bitrv2conj() only when n > 32
				 *   nw  offset used to locate the tail of `w` that is passed on
				 *   w   table forwarded to the callees
				 */
				void cftbsub(int n, double *a, int *ip, int nw, double *w)
				{
				    /* Block-scope prototypes for the routines this dispatcher calls. */
				    void cftx020(double *a);
				    void cftb040(double *a);
				    void cftf081(double *a, double *w);
				    void cftf161(double *a, double *w);
				    void cftb1st(int n, double *a, double *w);
				    void cftfx41(int n, double *a, int nw, double *w);
				    void cftleaf(int n, int isplt, double *a, int nw, double *w);
				    void cftrec4(int n, double *a, int nw, double *w);
				    void bitrv208neg(double *a);
				    void bitrv216neg(double *a);
				    void bitrv2conj(int n, int *ip, double *a);
				#ifdef USE_CDFT_THREADS
				    void cftrec4_th(int n, double *a, int nw, double *w);
				#endif /* USE_CDFT_THREADS */

				    /* Small sizes: a single dedicated routine, or nothing at all. */
				    if (n <= 8) {
				        if (n == 8) {
				            cftb040(a);
				        } else if (n == 4) {
				            cftx020(a);
				        }
				        return;
				    }

				    /* 8 < n < 32 */
				    if (n < 32) {
				        cftf081(a, w);
				        bitrv208neg(a);
				        return;
				    }

				    if (n == 32) {
				        cftf161(a, w + (nw - 8));
				        bitrv216neg(a);
				        return;
				    }

				    /* n > 32: first stage reads the last n/4 entries of w[0..nw-1]. */
				    cftb1st(n, a, w + (nw - (n >> 2)));

				#ifdef USE_CDFT_THREADS
				    /* Above the threshold the threaded variant replaces the serial choice. */
				    if (n > CDFT_THREADS_BEGIN_N) {
				        cftrec4_th(n, a, nw, w);
				        bitrv2conj(n, ip, a);
				        return;
				    }
				#endif /* USE_CDFT_THREADS */

				    if (n > 512) {
				        cftrec4(n, a, nw, w);
				    } else if (n > 128) {
				        cftleaf(n, 1, a, nw, w);
				    } else {
				        cftfx41(n, a, nw, w);
				    }
				    bitrv2conj(n, ip, a);
				}
			
 
				+
			
 
				+
			
 
				/* Exchange the two-double entries a[j1], a[j1 + 1] and a[k1], a[k1 + 1]. */
				static void bitrv2_swap(double *a, int j1, int k1)
				{
				    double xr, xi, yr, yi;

				    xr = a[j1];
				    xi = a[j1 + 1];
				    yr = a[k1];
				    yi = a[k1 + 1];
				    a[j1] = yr;
				    a[j1 + 1] = yi;
				    a[k1] = xr;
				    a[k1 + 1] = xi;
				}

				/*
				 * In-place reordering of `a`, viewed as n/2 adjacent (re, im) pairs.
				 *
				 * Pairs are exchanged at positions built from the loop counters and
				 * the caller-supplied offsets ip[m] .. ip[2*m - 1], where m is derived
				 * from n below.  With ip[2] = 0, ip[3] = 16 and n = 64 or 128 the
				 * exchanges amount to the bit-reversal permutation of the pair index;
				 * other sizes rely on the table contents -- confirm against the code
				 * that fills `ip`.
				 *
				 *   n   number of doubles in `a` that take part (used when n > 32 by
				 *       the visible callers)
				 *   ip  offset table; only ip[m + k], 0 <= k < m, is read
				 *   a   data, modified in place
				 *
				 * The pair exchange that used to be written out inline two dozen times
				 * now lives in bitrv2_swap(); the order of index updates is unchanged.
				 */
				void bitrv2(int n, int *ip, double *a)
				{
				    int j, j1, k, k1, l, m, nh, nm;

				    /* m doubles for every factor of 4 removed from n/4; l ends at 8 or below. */
				    m = 1;
				    for (l = n >> 2; l > 8; l >>= 2) {
				        m <<= 1;
				    }
				    nh = n >> 1;
				    nm = 4 * m;
				    if (l == 8) {
				        for (k = 0; k < m; k++) {
				            /* Off-diagonal (j < k): sixteen exchanges per (j, k). */
				            for (j = 0; j < k; j++) {
				                j1 = 4 * j + 2 * ip[m + k];
				                k1 = 4 * k + 2 * ip[m + j];
				                bitrv2_swap(a, j1, k1);
				                j1 += nm;  k1 += 2 * nm;  bitrv2_swap(a, j1, k1);
				                j1 += nm;  k1 -= nm;      bitrv2_swap(a, j1, k1);
				                j1 += nm;  k1 += 2 * nm;  bitrv2_swap(a, j1, k1);
				                j1 += nh;  k1 += 2;       bitrv2_swap(a, j1, k1);
				                j1 -= nm;  k1 -= 2 * nm;  bitrv2_swap(a, j1, k1);
				                j1 -= nm;  k1 += nm;      bitrv2_swap(a, j1, k1);
				                j1 -= nm;  k1 -= 2 * nm;  bitrv2_swap(a, j1, k1);
				                j1 += 2;   k1 += nh;      bitrv2_swap(a, j1, k1);
				                j1 += nm;  k1 += 2 * nm;  bitrv2_swap(a, j1, k1);
				                j1 += nm;  k1 -= nm;      bitrv2_swap(a, j1, k1);
				                j1 += nm;  k1 += 2 * nm;  bitrv2_swap(a, j1, k1);
				                j1 -= nh;  k1 -= 2;       bitrv2_swap(a, j1, k1);
				                j1 -= nm;  k1 -= 2 * nm;  bitrv2_swap(a, j1, k1);
				                j1 -= nm;  k1 += nm;      bitrv2_swap(a, j1, k1);
				                j1 -= nm;  k1 -= 2 * nm;  bitrv2_swap(a, j1, k1);
				            }
				            /* Diagonal (j == k): six exchanges. */
				            k1 = 4 * k + 2 * ip[m + k];
				            j1 = k1 + 2;
				            k1 += nh;
				            bitrv2_swap(a, j1, k1);
				            j1 += nm;       k1 += 2 * nm;      bitrv2_swap(a, j1, k1);
				            j1 += nm;       k1 -= nm;          bitrv2_swap(a, j1, k1);
				            j1 -= 2;        k1 -= nh;          bitrv2_swap(a, j1, k1);
				            j1 += nh + 2;   k1 += nh + 2;      bitrv2_swap(a, j1, k1);
				            j1 -= nh - nm;  k1 += 2 * nm - 2;  bitrv2_swap(a, j1, k1);
				        }
				    } else {
				        for (k = 0; k < m; k++) {
				            /* Off-diagonal (j < k): eight exchanges per (j, k). */
				            for (j = 0; j < k; j++) {
				                j1 = 4 * j + ip[m + k];
				                k1 = 4 * k + ip[m + j];
				                bitrv2_swap(a, j1, k1);
				                j1 += nm;  k1 += nm;  bitrv2_swap(a, j1, k1);
				                j1 += nh;  k1 += 2;   bitrv2_swap(a, j1, k1);
				                j1 -= nm;  k1 -= nm;  bitrv2_swap(a, j1, k1);
				                j1 += 2;   k1 += nh;  bitrv2_swap(a, j1, k1);
				                j1 += nm;  k1 += nm;  bitrv2_swap(a, j1, k1);
				                j1 -= nh;  k1 -= 2;   bitrv2_swap(a, j1, k1);
				                j1 -= nm;  k1 -= nm;  bitrv2_swap(a, j1, k1);
				            }
				            /* Diagonal (j == k): two exchanges. */
				            k1 = 4 * k + ip[m + k];
				            j1 = k1 + 2;
				            k1 += nh;
				            bitrv2_swap(a, j1, k1);
				            j1 += nm;  k1 += nm;  bitrv2_swap(a, j1, k1);
				        }
				    }
				}
			
 
				+
			
 
				+
			
 
				/*
				 * Exchange the two-double entries at j1 and k1, negating the second
				 * double (index + 1) of each entry while doing so.
				 */
				static void bitrv2conj_swap(double *a, int j1, int k1)
				{
				    double xr, xi, yr, yi;

				    xr = a[j1];
				    xi = -a[j1 + 1];
				    yr = a[k1];
				    yi = -a[k1 + 1];
				    a[j1] = yr;
				    a[j1 + 1] = yi;
				    a[k1] = xr;
				    a[k1 + 1] = xi;
				}

				/*
				 * Same index walk as bitrv2(), but every entry that is moved has its
				 * second double negated, and the entries that stay in place on the
				 * diagonal get the same sign flip through the stand-alone
				 * `a[x] = -a[x]` statements.  For n = 64 or 128 with ip[2] = 0,
				 * ip[3] = 16 the net effect is: reorder the (re, im) pairs by
				 * bit-reversed index and negate every im component; other sizes rely
				 * on the table contents -- confirm against the code that fills `ip`.
				 *
				 *   n   number of doubles in `a` that take part
				 *   ip  offset table; only ip[m + k], 0 <= k < m, is read
				 *   a   data, modified in place
				 *
				 * The negating exchange that used to be written out inline two dozen
				 * times now lives in bitrv2conj_swap(); the order of index updates and
				 * of the extra sign flips is unchanged.
				 */
				void bitrv2conj(int n, int *ip, double *a)
				{
				    int j, j1, k, k1, l, m, nh, nm;

				    /* m doubles for every factor of 4 removed from n/4; l ends at 8 or below. */
				    m = 1;
				    for (l = n >> 2; l > 8; l >>= 2) {
				        m <<= 1;
				    }
				    nh = n >> 1;
				    nm = 4 * m;
				    if (l == 8) {
				        for (k = 0; k < m; k++) {
				            /* Off-diagonal (j < k): sixteen negating exchanges per (j, k). */
				            for (j = 0; j < k; j++) {
				                j1 = 4 * j + 2 * ip[m + k];
				                k1 = 4 * k + 2 * ip[m + j];
				                bitrv2conj_swap(a, j1, k1);
				                j1 += nm;  k1 += 2 * nm;  bitrv2conj_swap(a, j1, k1);
				                j1 += nm;  k1 -= nm;      bitrv2conj_swap(a, j1, k1);
				                j1 += nm;  k1 += 2 * nm;  bitrv2conj_swap(a, j1, k1);
				                j1 += nh;  k1 += 2;       bitrv2conj_swap(a, j1, k1);
				                j1 -= nm;  k1 -= 2 * nm;  bitrv2conj_swap(a, j1, k1);
				                j1 -= nm;  k1 += nm;      bitrv2conj_swap(a, j1, k1);
				                j1 -= nm;  k1 -= 2 * nm;  bitrv2conj_swap(a, j1, k1);
				                j1 += 2;   k1 += nh;      bitrv2conj_swap(a, j1, k1);
				                j1 += nm;  k1 += 2 * nm;  bitrv2conj_swap(a, j1, k1);
				                j1 += nm;  k1 -= nm;      bitrv2conj_swap(a, j1, k1);
				                j1 += nm;  k1 += 2 * nm;  bitrv2conj_swap(a, j1, k1);
				                j1 -= nh;  k1 -= 2;       bitrv2conj_swap(a, j1, k1);
				                j1 -= nm;  k1 -= 2 * nm;  bitrv2conj_swap(a, j1, k1);
				                j1 -= nm;  k1 += nm;      bitrv2conj_swap(a, j1, k1);
				                j1 -= nm;  k1 -= 2 * nm;  bitrv2conj_swap(a, j1, k1);
				            }
				            /* Diagonal (j == k): six exchanges plus four in-place sign flips. */
				            k1 = 4 * k + 2 * ip[m + k];
				            j1 = k1 + 2;
				            k1 += nh;
				            a[j1 - 1] = -a[j1 - 1];   /* entry just before j1 is not moved */
				            bitrv2conj_swap(a, j1, k1);
				            a[k1 + 3] = -a[k1 + 3];   /* entry just after k1 is not moved */
				            j1 += nm;       k1 += 2 * nm;      bitrv2conj_swap(a, j1, k1);
				            j1 += nm;       k1 -= nm;          bitrv2conj_swap(a, j1, k1);
				            j1 -= 2;        k1 -= nh;          bitrv2conj_swap(a, j1, k1);
				            j1 += nh + 2;   k1 += nh + 2;      bitrv2conj_swap(a, j1, k1);
				            j1 -= nh - nm;
				            k1 += 2 * nm - 2;
				            a[j1 - 1] = -a[j1 - 1];
				            bitrv2conj_swap(a, j1, k1);
				            a[k1 + 3] = -a[k1 + 3];
				        }
				    } else {
				        for (k = 0; k < m; k++) {
				            /* Off-diagonal (j < k): eight negating exchanges per (j, k). */
				            for (j = 0; j < k; j++) {
				                j1 = 4 * j + ip[m + k];
				                k1 = 4 * k + ip[m + j];
				                bitrv2conj_swap(a, j1, k1);
				                j1 += nm;  k1 += nm;  bitrv2conj_swap(a, j1, k1);
				                j1 += nh;  k1 += 2;   bitrv2conj_swap(a, j1, k1);
				                j1 -= nm;  k1 -= nm;  bitrv2conj_swap(a, j1, k1);
				                j1 += 2;   k1 += nh;  bitrv2conj_swap(a, j1, k1);
				                j1 += nm;  k1 += nm;  bitrv2conj_swap(a, j1, k1);
				                j1 -= nh;  k1 -= 2;   bitrv2conj_swap(a, j1, k1);
				                j1 -= nm;  k1 -= nm;  bitrv2conj_swap(a, j1, k1);
				            }
				            /* Diagonal (j == k): two exchanges plus four in-place sign flips. */
				            k1 = 4 * k + ip[m + k];
				            j1 = k1 + 2;
				            k1 += nh;
				            a[j1 - 1] = -a[j1 - 1];
				            bitrv2conj_swap(a, j1, k1);
				            a[k1 + 3] = -a[k1 + 3];
				            j1 += nm;
				            k1 += nm;
				            a[j1 - 1] = -a[j1 - 1];
				            bitrv2conj_swap(a, j1, k1);
				            a[k1 + 3] = -a[k1 + 3];
				        }
				    }
				}
			
 
				+
			
 
				+
			
 
				/*
				 * Fixed reordering of 16 (re, im) pairs stored in a[0..31].
				 *
				 * Pair p sits at a[2*p], a[2*p + 1].  Each listed pair index is
				 * exchanged with its 4-bit bit-reversed partner; indices 0, 6, 9 and
				 * 15 are their own reversal and are left untouched.
				 */
				void bitrv216(double *a)
				{
				    static const int partner[6][2] = {
				        {1, 8}, {2, 4}, {3, 12}, {5, 10}, {7, 14}, {11, 13}
				    };
				    int p;

				    for (p = 0; p < 6; p++) {
				        const int lo = 2 * partner[p][0];
				        const int hi = 2 * partner[p][1];
				        const double re = a[lo];
				        const double im = a[lo + 1];

				        a[lo] = a[hi];
				        a[lo + 1] = a[hi + 1];
				        a[hi] = re;
				        a[hi + 1] = im;
				    }
				}
			
 
				+
			
 
				+
			
 
				/*
				 * Fixed reordering of 16 (re, im) pairs stored in a[0..31].
				 *
				 * Pair 0 stays put.  For k = 1..15 the new pair k is the old pair
				 * source[k - 1], which equals the 4-bit bit reversal of (16 - k).
				 * Only positions change; no value has its sign altered here.
				 */
				void bitrv216neg(double *a)
				{
				    static const int source[15] = {
				        15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8
				    };
				    double old[32];
				    int k;

				    /* Snapshot everything that will be overwritten (a[0], a[1] are not). */
				    for (k = 2; k < 32; k++) {
				        old[k] = a[k];
				    }
				    for (k = 1; k <= 15; k++) {
				        const int from = 2 * source[k - 1];

				        a[2 * k] = old[from];
				        a[2 * k + 1] = old[from + 1];
				    }
				}
			
 
				+
			
 
				+
			
 
				+void bitrv208(double *a) /* In-place reordering of the 8 two-double pairs (a[2k], a[2k+1]) held in a[0..15]. */
			
 
				+{ /* Exchanges pair 1 with pair 4 and pair 3 with pair 6 (3-bit reversal of the pair index); pairs 0, 2, 5 and 7 are left alone. */
			
 
				+    double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i; /* saved copies of the four pairs that move */
			
 
				+    /* Load all moving pairs before storing any of them. */
			
 
				+    x1r = a[2];
			
 
				+    x1i = a[3];
			
 
				+    x3r = a[6];
			
 
				+    x3i = a[7];
			
 
				+    x4r = a[8];
			
 
				+    x4i = a[9];
			
 
				+    x6r = a[12];
			
 
				+    x6i = a[13];
			
 
				+    a[2] = x4r; /* slot 1 <- pair 4 */
			
 
				+    a[3] = x4i;
			
 
				+    a[6] = x6r; /* slot 3 <- pair 6 */
			
 
				+    a[7] = x6i;
			
 
				+    a[8] = x1r; /* slot 4 <- pair 1 */
			
 
				+    a[9] = x1i;
			
 
				+    a[12] = x3r; /* slot 6 <- pair 3 */
			
 
				+    a[13] = x3i;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void bitrv208neg(double *a) /* In-place reordering of the 8 two-double pairs (a[2k], a[2k+1]) held in a[0..15]. */
			
 
				+{ /* Original pair k (1..7) ends up in slot (8 - rev3(k)) mod 8, where rev3 reverses the 3-bit index; pair 0 is neither read nor written. */
			
 
				+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, /* xKr / xKi: saved copies of a[2K] / a[2K+1] */
			
 
				+        x5r, x5i, x6r, x6i, x7r, x7i;
			
 
				+    /* Load every pair 1..7 before any store. */
			
 
				+    x1r = a[2];
			
 
				+    x1i = a[3];
			
 
				+    x2r = a[4];
			
 
				+    x2i = a[5];
			
 
				+    x3r = a[6];
			
 
				+    x3i = a[7];
			
 
				+    x4r = a[8];
			
 
				+    x4i = a[9];
			
 
				+    x5r = a[10];
			
 
				+    x5i = a[11];
			
 
				+    x6r = a[12];
			
 
				+    x6i = a[13];
			
 
				+    x7r = a[14];
			
 
				+    x7i = a[15];
			
 
				+    a[2] = x7r; /* slot 1 <- pair 7 */
			
 
				+    a[3] = x7i;
			
 
				+    a[4] = x3r; /* slot 2 <- pair 3 */
			
 
				+    a[5] = x3i;
			
 
				+    a[6] = x5r; /* slot 3 <- pair 5 */
			
 
				+    a[7] = x5i;
			
 
				+    a[8] = x1r; /* slot 4 <- pair 1 */
			
 
				+    a[9] = x1i;
			
 
				+    a[10] = x6r; /* slot 5 <- pair 6 */
			
 
				+    a[11] = x6i;
			
 
				+    a[12] = x2r; /* slot 6 <- pair 2 */
			
 
				+    a[13] = x2i;
			
 
				+    a[14] = x4r; /* slot 7 <- pair 4 */
			
 
				+    a[15] = x4i;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void cftf1st(int n, double *a, double *w) /* In-place 4-point butterfly pass combining a[j], a[j+m], a[j+2m], a[j+3m] with m = 2 * (n >> 3); presumably the first stage of the forward transform - confirm at the call site. */
			
 
				+{ /* a: data as (r, i) pairs, overwritten; w: coefficient table, read at w[1], w[2], w[3] and then in groups of four from w[4] on. */
			
 
				+    int j, j0, j1, j2, j3, k, m, mh;
			
 
				+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, /* wk*: csc-scaled sum of the previous and the current table entry */
			
 
				+        wd1r, wd1i, wd3r, wd3i; /* wd*: most recently read table entry (starts at 1 + 0i) */
			
 
				+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, /* x*: temporaries for the pair at j (or j0) */
			
 
				+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i; /* y*: temporaries for the neighbouring pair at j + 2 (or j0 - 2) */
			
 
				+
			
 
				+    mh = n >> 3; /* half of the block stride m */
			
 
				+    m = 2 * mh;
			
 
				+    j1 = m;
			
 
				+    j2 = j1 + m;
			
 
				+    j3 = j2 + m;
			
 
				+    x0r = a[0] + a[j2]; /* pair 0: butterfly without any coefficient multiply */
			
 
				+    x0i = a[1] + a[j2 + 1];
			
 
				+    x1r = a[0] - a[j2];
			
 
				+    x1i = a[1] - a[j2 + 1];
			
 
				+    x2r = a[j1] + a[j3];
			
 
				+    x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+    x3r = a[j1] - a[j3];
			
 
				+    x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+    a[0] = x0r + x2r;
			
 
				+    a[1] = x0i + x2i;
			
 
				+    a[j1] = x0r - x2r;
			
 
				+    a[j1 + 1] = x0i - x2i;
			
 
				+    a[j2] = x1r - x3i;
			
 
				+    a[j2 + 1] = x1i + x3r;
			
 
				+    a[j3] = x1r + x3i;
			
 
				+    a[j3 + 1] = x1i - x3r;
			
 
				+    wn4r = w[1]; /* used below as both factors of the mid-point rotation - presumably cos(pi/4); verify against the table builder */
			
 
				+    csc1 = w[2]; /* scale factors applied to the summed table entries */
			
 
				+    csc3 = w[3];
			
 
				+    wd1r = 1;
			
 
				+    wd1i = 0;
			
 
				+    wd3r = 1;
			
 
				+    wd3i = 0;
			
 
				+    k = 0;
			
 
				+    for (j = 2; j < mh - 2; j += 4) { /* each pass handles pairs j and j + 2 plus the mirrored pairs m - j and m - j - 2 */
			
 
				+        k += 4; /* four table values per pass: w[k], w[k+1] for output block 2, w[k+2], w[k+3] for output block 3 */
			
 
				+        wk1r = csc1 * (wd1r + w[k]);
			
 
				+        wk1i = csc1 * (wd1i + w[k + 1]);
			
 
				+        wk3r = csc3 * (wd3r + w[k + 2]);
			
 
				+        wk3i = csc3 * (wd3i + w[k + 3]);
			
 
				+        wd1r = w[k];
			
 
				+        wd1i = w[k + 1];
			
 
				+        wd3r = w[k + 2];
			
 
				+        wd3i = w[k + 3];
			
 
				+        j1 = j + m;
			
 
				+        j2 = j1 + m;
			
 
				+        j3 = j2 + m;
			
 
				+        x0r = a[j] + a[j2];
			
 
				+        x0i = a[j + 1] + a[j2 + 1];
			
 
				+        x1r = a[j] - a[j2];
			
 
				+        x1i = a[j + 1] - a[j2 + 1];
			
 
				+        y0r = a[j + 2] + a[j2 + 2];
			
 
				+        y0i = a[j + 3] + a[j2 + 3];
			
 
				+        y1r = a[j + 2] - a[j2 + 2];
			
 
				+        y1i = a[j + 3] - a[j2 + 3];
			
 
				+        x2r = a[j1] + a[j3];
			
 
				+        x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+        x3r = a[j1] - a[j3];
			
 
				+        x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+        y2r = a[j1 + 2] + a[j3 + 2];
			
 
				+        y2i = a[j1 + 3] + a[j3 + 3];
			
 
				+        y3r = a[j1 + 2] - a[j3 + 2];
			
 
				+        y3i = a[j1 + 3] - a[j3 + 3];
			
 
				+        a[j] = x0r + x2r;
			
 
				+        a[j + 1] = x0i + x2i;
			
 
				+        a[j + 2] = y0r + y2r;
			
 
				+        a[j + 3] = y0i + y2i;
			
 
				+        a[j1] = x0r - x2r;
			
 
				+        a[j1 + 1] = x0i - x2i;
			
 
				+        a[j1 + 2] = y0r - y2r;
			
 
				+        a[j1 + 3] = y0i - y2i;
			
 
				+        x0r = x1r - x3i;
			
 
				+        x0i = x1i + x3r;
			
 
				+        a[j2] = wk1r * x0r - wk1i * x0i; /* pair j uses the summed coefficient wk1 */
			
 
				+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
			
 
				+        x0r = y1r - y3i;
			
 
				+        x0i = y1i + y3r;
			
 
				+        a[j2 + 2] = wd1r * x0r - wd1i * x0i; /* pair j + 2 uses the table entry wd1 directly */
			
 
				+        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
			
 
				+        x0r = x1r + x3i;
			
 
				+        x0i = x1i - x3r;
			
 
				+        a[j3] = wk3r * x0r + wk3i * x0i;
			
 
				+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
			
 
				+        x0r = y1r + y3i;
			
 
				+        x0i = y1i - y3r;
			
 
				+        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
			
 
				+        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
			
 
				+        j0 = m - j; /* mirrored pairs: same coefficients, applied with their r and i parts exchanged */
			
 
				+        j1 = j0 + m;
			
 
				+        j2 = j1 + m;
			
 
				+        j3 = j2 + m;
			
 
				+        x0r = a[j0] + a[j2];
			
 
				+        x0i = a[j0 + 1] + a[j2 + 1];
			
 
				+        x1r = a[j0] - a[j2];
			
 
				+        x1i = a[j0 + 1] - a[j2 + 1];
			
 
				+        y0r = a[j0 - 2] + a[j2 - 2];
			
 
				+        y0i = a[j0 - 1] + a[j2 - 1];
			
 
				+        y1r = a[j0 - 2] - a[j2 - 2];
			
 
				+        y1i = a[j0 - 1] - a[j2 - 1];
			
 
				+        x2r = a[j1] + a[j3];
			
 
				+        x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+        x3r = a[j1] - a[j3];
			
 
				+        x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+        y2r = a[j1 - 2] + a[j3 - 2];
			
 
				+        y2i = a[j1 - 1] + a[j3 - 1];
			
 
				+        y3r = a[j1 - 2] - a[j3 - 2];
			
 
				+        y3i = a[j1 - 1] - a[j3 - 1];
			
 
				+        a[j0] = x0r + x2r;
			
 
				+        a[j0 + 1] = x0i + x2i;
			
 
				+        a[j0 - 2] = y0r + y2r;
			
 
				+        a[j0 - 1] = y0i + y2i;
			
 
				+        a[j1] = x0r - x2r;
			
 
				+        a[j1 + 1] = x0i - x2i;
			
 
				+        a[j1 - 2] = y0r - y2r;
			
 
				+        a[j1 - 1] = y0i - y2i;
			
 
				+        x0r = x1r - x3i;
			
 
				+        x0i = x1i + x3r;
			
 
				+        a[j2] = wk1i * x0r - wk1r * x0i;
			
 
				+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
			
 
				+        x0r = y1r - y3i;
			
 
				+        x0i = y1i + y3r;
			
 
				+        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
			
 
				+        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
			
 
				+        x0r = x1r + x3i;
			
 
				+        x0i = x1i - x3r;
			
 
				+        a[j3] = wk3i * x0r + wk3r * x0i;
			
 
				+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
			
 
				+        x0r = y1r + y3i;
			
 
				+        x0i = y1i - y3r;
			
 
				+        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
			
 
				+        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
			
 
				+    }
			
 
				+    wk1r = csc1 * (wd1r + wn4r); /* coefficients for the two pairs on either side of the mid-point j0 = mh, built from the last table entry and wn4r */
			
 
				+    wk1i = csc1 * (wd1i + wn4r);
			
 
				+    wk3r = csc3 * (wd3r - wn4r);
			
 
				+    wk3i = csc3 * (wd3i - wn4r);
			
 
				+    j0 = mh;
			
 
				+    j1 = j0 + m;
			
 
				+    j2 = j1 + m;
			
 
				+    j3 = j2 + m;
			
 
				+    x0r = a[j0 - 2] + a[j2 - 2]; /* pair just below the mid-point */
			
 
				+    x0i = a[j0 - 1] + a[j2 - 1];
			
 
				+    x1r = a[j0 - 2] - a[j2 - 2];
			
 
				+    x1i = a[j0 - 1] - a[j2 - 1];
			
 
				+    x2r = a[j1 - 2] + a[j3 - 2];
			
 
				+    x2i = a[j1 - 1] + a[j3 - 1];
			
 
				+    x3r = a[j1 - 2] - a[j3 - 2];
			
 
				+    x3i = a[j1 - 1] - a[j3 - 1];
			
 
				+    a[j0 - 2] = x0r + x2r;
			
 
				+    a[j0 - 1] = x0i + x2i;
			
 
				+    a[j1 - 2] = x0r - x2r;
			
 
				+    a[j1 - 1] = x0i - x2i;
			
 
				+    x0r = x1r - x3i;
			
 
				+    x0i = x1i + x3r;
			
 
				+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
			
 
				+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
			
 
				+    x0r = x1r + x3i;
			
 
				+    x0i = x1i - x3r;
			
 
				+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
			
 
				+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
			
 
				+    x0r = a[j0] + a[j2]; /* mid-point pair: both rotations reduce to a single multiply by +/- wn4r */
			
 
				+    x0i = a[j0 + 1] + a[j2 + 1];
			
 
				+    x1r = a[j0] - a[j2];
			
 
				+    x1i = a[j0 + 1] - a[j2 + 1];
			
 
				+    x2r = a[j1] + a[j3];
			
 
				+    x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+    x3r = a[j1] - a[j3];
			
 
				+    x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+    a[j0] = x0r + x2r;
			
 
				+    a[j0 + 1] = x0i + x2i;
			
 
				+    a[j1] = x0r - x2r;
			
 
				+    a[j1 + 1] = x0i - x2i;
			
 
				+    x0r = x1r - x3i;
			
 
				+    x0i = x1i + x3r;
			
 
				+    a[j2] = wn4r * (x0r - x0i);
			
 
				+    a[j2 + 1] = wn4r * (x0i + x0r);
			
 
				+    x0r = x1r + x3i;
			
 
				+    x0i = x1i - x3r;
			
 
				+    a[j3] = -wn4r * (x0r + x0i);
			
 
				+    a[j3 + 1] = -wn4r * (x0i - x0r);
			
 
				+    x0r = a[j0 + 2] + a[j2 + 2]; /* pair just above the mid-point: same wk values with r and i exchanged */
			
 
				+    x0i = a[j0 + 3] + a[j2 + 3];
			
 
				+    x1r = a[j0 + 2] - a[j2 + 2];
			
 
				+    x1i = a[j0 + 3] - a[j2 + 3];
			
 
				+    x2r = a[j1 + 2] + a[j3 + 2];
			
 
				+    x2i = a[j1 + 3] + a[j3 + 3];
			
 
				+    x3r = a[j1 + 2] - a[j3 + 2];
			
 
				+    x3i = a[j1 + 3] - a[j3 + 3];
			
 
				+    a[j0 + 2] = x0r + x2r;
			
 
				+    a[j0 + 3] = x0i + x2i;
			
 
				+    a[j1 + 2] = x0r - x2r;
			
 
				+    a[j1 + 3] = x0i - x2i;
			
 
				+    x0r = x1r - x3i;
			
 
				+    x0i = x1i + x3r;
			
 
				+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
			
 
				+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
			
 
				+    x0r = x1r + x3i;
			
 
				+    x0i = x1i - x3r;
			
 
				+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
			
 
				+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void cftb1st(int n, double *a, double *w) /* In-place 4-point butterfly pass over a[j], a[j+m], a[j+2m], a[j+3m] with m = 2 * (n >> 3), taking the i (odd-index) inputs with flipped sign; presumably the first stage of the backward transform - confirm at the call site. */
			
 
				+{ /* a: data as (r, i) pairs, overwritten; w: coefficient table, read at w[1], w[2], w[3] and then in groups of four from w[4] on. */
			
 
				+    int j, j0, j1, j2, j3, k, m, mh;
			
 
				+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, /* wk*: csc-scaled sum of the previous and the current table entry */
			
 
				+        wd1r, wd1i, wd3r, wd3i; /* wd*: most recently read table entry (starts at 1 + 0i) */
			
 
				+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, /* x*: temporaries for the pair at j (or j0) */
			
 
				+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i; /* y*: temporaries for the neighbouring pair at j + 2 (or j0 - 2) */
			
 
				+
			
 
				+    mh = n >> 3; /* half of the block stride m */
			
 
				+    m = 2 * mh;
			
 
				+    j1 = m;
			
 
				+    j2 = j1 + m;
			
 
				+    j3 = j2 + m;
			
 
				+    x0r = a[0] + a[j2]; /* pair 0: no coefficient multiply */
			
 
				+    x0i = -a[1] - a[j2 + 1]; /* i parts of blocks 0 and 2 enter negated ... */
			
 
				+    x1r = a[0] - a[j2];
			
 
				+    x1i = -a[1] + a[j2 + 1];
			
 
				+    x2r = a[j1] + a[j3];
			
 
				+    x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+    x3r = a[j1] - a[j3];
			
 
				+    x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+    a[0] = x0r + x2r;
			
 
				+    a[1] = x0i - x2i; /* ... and those of blocks 1 and 3 are subtracted, so a[1] becomes minus the sum of all four i inputs */
			
 
				+    a[j1] = x0r - x2r;
			
 
				+    a[j1 + 1] = x0i + x2i;
			
 
				+    a[j2] = x1r + x3i;
			
 
				+    a[j2 + 1] = x1i + x3r;
			
 
				+    a[j3] = x1r - x3i;
			
 
				+    a[j3 + 1] = x1i - x3r;
			
 
				+    wn4r = w[1]; /* used below as both factors of the mid-point rotation - presumably cos(pi/4); verify against the table builder */
			
 
				+    csc1 = w[2]; /* scale factors applied to the summed table entries */
			
 
				+    csc3 = w[3];
			
 
				+    wd1r = 1;
			
 
				+    wd1i = 0;
			
 
				+    wd3r = 1;
			
 
				+    wd3i = 0;
			
 
				+    k = 0;
			
 
				+    for (j = 2; j < mh - 2; j += 4) { /* each pass handles pairs j and j + 2 plus the mirrored pairs m - j and m - j - 2 */
			
 
				+        k += 4; /* four table values per pass: w[k], w[k+1] for output block 2, w[k+2], w[k+3] for output block 3 */
			
 
				+        wk1r = csc1 * (wd1r + w[k]);
			
 
				+        wk1i = csc1 * (wd1i + w[k + 1]);
			
 
				+        wk3r = csc3 * (wd3r + w[k + 2]);
			
 
				+        wk3i = csc3 * (wd3i + w[k + 3]);
			
 
				+        wd1r = w[k];
			
 
				+        wd1i = w[k + 1];
			
 
				+        wd3r = w[k + 2];
			
 
				+        wd3i = w[k + 3];
			
 
				+        j1 = j + m;
			
 
				+        j2 = j1 + m;
			
 
				+        j3 = j2 + m;
			
 
				+        x0r = a[j] + a[j2];
			
 
				+        x0i = -a[j + 1] - a[j2 + 1];
			
 
				+        x1r = a[j] - a[j2];
			
 
				+        x1i = -a[j + 1] + a[j2 + 1];
			
 
				+        y0r = a[j + 2] + a[j2 + 2];
			
 
				+        y0i = -a[j + 3] - a[j2 + 3];
			
 
				+        y1r = a[j + 2] - a[j2 + 2];
			
 
				+        y1i = -a[j + 3] + a[j2 + 3];
			
 
				+        x2r = a[j1] + a[j3];
			
 
				+        x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+        x3r = a[j1] - a[j3];
			
 
				+        x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+        y2r = a[j1 + 2] + a[j3 + 2];
			
 
				+        y2i = a[j1 + 3] + a[j3 + 3];
			
 
				+        y3r = a[j1 + 2] - a[j3 + 2];
			
 
				+        y3i = a[j1 + 3] - a[j3 + 3];
			
 
				+        a[j] = x0r + x2r;
			
 
				+        a[j + 1] = x0i - x2i;
			
 
				+        a[j + 2] = y0r + y2r;
			
 
				+        a[j + 3] = y0i - y2i;
			
 
				+        a[j1] = x0r - x2r;
			
 
				+        a[j1 + 1] = x0i + x2i;
			
 
				+        a[j1 + 2] = y0r - y2r;
			
 
				+        a[j1 + 3] = y0i + y2i;
			
 
				+        x0r = x1r + x3i;
			
 
				+        x0i = x1i + x3r;
			
 
				+        a[j2] = wk1r * x0r - wk1i * x0i; /* pair j uses the summed coefficient wk1 */
			
 
				+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
			
 
				+        x0r = y1r + y3i;
			
 
				+        x0i = y1i + y3r;
			
 
				+        a[j2 + 2] = wd1r * x0r - wd1i * x0i; /* pair j + 2 uses the table entry wd1 directly */
			
 
				+        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
			
 
				+        x0r = x1r - x3i;
			
 
				+        x0i = x1i - x3r;
			
 
				+        a[j3] = wk3r * x0r + wk3i * x0i;
			
 
				+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
			
 
				+        x0r = y1r - y3i;
			
 
				+        x0i = y1i - y3r;
			
 
				+        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
			
 
				+        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
			
 
				+        j0 = m - j; /* mirrored pairs: same coefficients, applied with their r and i parts exchanged */
			
 
				+        j1 = j0 + m;
			
 
				+        j2 = j1 + m;
			
 
				+        j3 = j2 + m;
			
 
				+        x0r = a[j0] + a[j2];
			
 
				+        x0i = -a[j0 + 1] - a[j2 + 1];
			
 
				+        x1r = a[j0] - a[j2];
			
 
				+        x1i = -a[j0 + 1] + a[j2 + 1];
			
 
				+        y0r = a[j0 - 2] + a[j2 - 2];
			
 
				+        y0i = -a[j0 - 1] - a[j2 - 1];
			
 
				+        y1r = a[j0 - 2] - a[j2 - 2];
			
 
				+        y1i = -a[j0 - 1] + a[j2 - 1];
			
 
				+        x2r = a[j1] + a[j3];
			
 
				+        x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+        x3r = a[j1] - a[j3];
			
 
				+        x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+        y2r = a[j1 - 2] + a[j3 - 2];
			
 
				+        y2i = a[j1 - 1] + a[j3 - 1];
			
 
				+        y3r = a[j1 - 2] - a[j3 - 2];
			
 
				+        y3i = a[j1 - 1] - a[j3 - 1];
			
 
				+        a[j0] = x0r + x2r;
			
 
				+        a[j0 + 1] = x0i - x2i;
			
 
				+        a[j0 - 2] = y0r + y2r;
			
 
				+        a[j0 - 1] = y0i - y2i;
			
 
				+        a[j1] = x0r - x2r;
			
 
				+        a[j1 + 1] = x0i + x2i;
			
 
				+        a[j1 - 2] = y0r - y2r;
			
 
				+        a[j1 - 1] = y0i + y2i;
			
 
				+        x0r = x1r + x3i;
			
 
				+        x0i = x1i + x3r;
			
 
				+        a[j2] = wk1i * x0r - wk1r * x0i;
			
 
				+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
			
 
				+        x0r = y1r + y3i;
			
 
				+        x0i = y1i + y3r;
			
 
				+        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
			
 
				+        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
			
 
				+        x0r = x1r - x3i;
			
 
				+        x0i = x1i - x3r;
			
 
				+        a[j3] = wk3i * x0r + wk3r * x0i;
			
 
				+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
			
 
				+        x0r = y1r - y3i;
			
 
				+        x0i = y1i - y3r;
			
 
				+        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
			
 
				+        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
			
 
				+    }
			
 
				+    wk1r = csc1 * (wd1r + wn4r); /* coefficients for the two pairs on either side of the mid-point j0 = mh, built from the last table entry and wn4r */
			
 
				+    wk1i = csc1 * (wd1i + wn4r);
			
 
				+    wk3r = csc3 * (wd3r - wn4r);
			
 
				+    wk3i = csc3 * (wd3i - wn4r);
			
 
				+    j0 = mh;
			
 
				+    j1 = j0 + m;
			
 
				+    j2 = j1 + m;
			
 
				+    j3 = j2 + m;
			
 
				+    x0r = a[j0 - 2] + a[j2 - 2]; /* pair just below the mid-point */
			
 
				+    x0i = -a[j0 - 1] - a[j2 - 1];
			
 
				+    x1r = a[j0 - 2] - a[j2 - 2];
			
 
				+    x1i = -a[j0 - 1] + a[j2 - 1];
			
 
				+    x2r = a[j1 - 2] + a[j3 - 2];
			
 
				+    x2i = a[j1 - 1] + a[j3 - 1];
			
 
				+    x3r = a[j1 - 2] - a[j3 - 2];
			
 
				+    x3i = a[j1 - 1] - a[j3 - 1];
			
 
				+    a[j0 - 2] = x0r + x2r;
			
 
				+    a[j0 - 1] = x0i - x2i;
			
 
				+    a[j1 - 2] = x0r - x2r;
			
 
				+    a[j1 - 1] = x0i + x2i;
			
 
				+    x0r = x1r + x3i;
			
 
				+    x0i = x1i + x3r;
			
 
				+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
			
 
				+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
			
 
				+    x0r = x1r - x3i;
			
 
				+    x0i = x1i - x3r;
			
 
				+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
			
 
				+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
			
 
				+    x0r = a[j0] + a[j2]; /* mid-point pair: both rotations reduce to a single multiply by +/- wn4r */
			
 
				+    x0i = -a[j0 + 1] - a[j2 + 1];
			
 
				+    x1r = a[j0] - a[j2];
			
 
				+    x1i = -a[j0 + 1] + a[j2 + 1];
			
 
				+    x2r = a[j1] + a[j3];
			
 
				+    x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+    x3r = a[j1] - a[j3];
			
 
				+    x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+    a[j0] = x0r + x2r;
			
 
				+    a[j0 + 1] = x0i - x2i;
			
 
				+    a[j1] = x0r - x2r;
			
 
				+    a[j1 + 1] = x0i + x2i;
			
 
				+    x0r = x1r + x3i;
			
 
				+    x0i = x1i + x3r;
			
 
				+    a[j2] = wn4r * (x0r - x0i);
			
 
				+    a[j2 + 1] = wn4r * (x0i + x0r);
			
 
				+    x0r = x1r - x3i;
			
 
				+    x0i = x1i - x3r;
			
 
				+    a[j3] = -wn4r * (x0r + x0i);
			
 
				+    a[j3 + 1] = -wn4r * (x0i - x0r);
			
 
				+    x0r = a[j0 + 2] + a[j2 + 2]; /* pair just above the mid-point: same wk values with r and i exchanged */
			
 
				+    x0i = -a[j0 + 3] - a[j2 + 3];
			
 
				+    x1r = a[j0 + 2] - a[j2 + 2];
			
 
				+    x1i = -a[j0 + 3] + a[j2 + 3];
			
 
				+    x2r = a[j1 + 2] + a[j3 + 2];
			
 
				+    x2i = a[j1 + 3] + a[j3 + 3];
			
 
				+    x3r = a[j1 + 2] - a[j3 + 2];
			
 
				+    x3i = a[j1 + 3] - a[j3 + 3];
			
 
				+    a[j0 + 2] = x0r + x2r;
			
 
				+    a[j0 + 3] = x0i - x2i;
			
 
				+    a[j1 + 2] = x0r - x2r;
			
 
				+    a[j1 + 3] = x0i + x2i;
			
 
				+    x0r = x1r + x3i;
			
 
				+    x0i = x1i + x3r;
			
 
				+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
			
 
				+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
			
 
				+    x0r = x1r - x3i;
			
 
				+    x0i = x1i - x3r;
			
 
				+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
			
 
				+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+#ifdef USE_CDFT_THREADS
			
 
				+struct cdft_arg_st { /* Argument bundle handed to each worker thread started by cftrec4_th. */
			
 
				+    int n0; /* length passed to cftrec4_th (same value for every worker) */
			
 
				+    int n; /* length of this worker's slice of the data */
			
 
				+    double *a; /* first element of this worker's slice */
			
 
				+    int nw; /* forwarded unchanged; workers use it as an end offset into w (&w[nw - ...]) */
			
 
				+    double *w; /* coefficient table shared by all workers */
			
 
				+};
			
 
				+typedef struct cdft_arg_st cdft_arg_t;
			
 
				+
			
 
				+
			
 
				+void cftrec4_th(int n, double *a, int nw, double *w) /* Splits a[0..n-1] into 2 or 4 equal slices, runs one worker thread per slice and waits for all of them. */
			
 
				+{ /* cdft_thread_t, cdft_thread_create, cdft_thread_wait and CDFT_4THREADS_BEGIN_N are defined outside this block. */
			
 
				+    void *cftrec1_th(void *p);
			
 
				+    void *cftrec2_th(void *p);
			
 
				+    int i, idiv4, m, nthread;
			
 
				+    cdft_thread_t th[4];
			
 
				+    cdft_arg_t ag[4]; /* one argument record per worker; must stay alive until the waits below complete */
			
 
				+
			
 
				+    nthread = 2; /* default: two workers, each on half of the data */
			
 
				+    idiv4 = 0; /* index of the single worker that runs cftrec2_th */
			
 
				+    m = n >> 1;
			
 
				+    if (n > CDFT_4THREADS_BEGIN_N) { /* large input: four workers on quarter slices, cftrec2_th on slice 1 */
			
 
				+        nthread = 4;
			
 
				+        idiv4 = 1;
			
 
				+        m >>= 1;
			
 
				+    }
			
 
				+    for (i = 0; i < nthread; i++) {
			
 
				+        ag[i].n0 = n;
			
 
				+        ag[i].n = m;
			
 
				+        ag[i].a = &a[i * m];
			
 
				+        ag[i].nw = nw;
			
 
				+        ag[i].w = w;
			
 
				+        if (i != idiv4) {
			
 
				+            cdft_thread_create(&th[i], cftrec1_th, &ag[i]);
			
 
				+        } else {
			
 
				+            cdft_thread_create(&th[i], cftrec2_th, &ag[i]);
			
 
				+        }
			
 
				+    }
			
 
				+    for (i = 0; i < nthread; i++) { /* join every worker before returning */
			
 
				+        cdft_thread_wait(th[i]);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void *cftrec1_th(void *p) /* Thread entry: processes one slice described by the cdft_arg_t that p points to, using cftmdl1 for the top-level passes; always returns a null pointer. */
			
 
				+{
			
 
				+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
			
 
				+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
			
 
				+    void cftmdl1(int n, double *a, double *w);
			
 
				+    int isplt, j, k, m, n, n0, nw;
			
 
				+    double *a, *w;
			
 
				+    /* Unpack the argument record. */
			
 
				+    n0 = ((cdft_arg_t *) p)->n0;
			
 
				+    n = ((cdft_arg_t *) p)->n;
			
 
				+    a = ((cdft_arg_t *) p)->a;
			
 
				+    nw = ((cdft_arg_t *) p)->nw;
			
 
				+    w = ((cdft_arg_t *) p)->w;
			
 
				+    m = n0; /* block size starts from n0, not from the slice length n */
			
 
				+    while (m > 512) { /* quarter m until it is at most 512, applying cftmdl1 to the last m doubles of the slice each time */
			
 
				+        m >>= 2;
			
 
				+        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
			
 
				+    }
			
 
				+    cftleaf(m, 1, &a[n - m], nw, w); /* finish the last block of the slice */
			
 
				+    k = 0;
			
 
				+    for (j = n - m; j > 0; j -= m) { /* remaining blocks, from the end of the slice toward its start; k counts them from 1 */
			
 
				+        k++;
			
 
				+        isplt = cfttree(m, j, k, a, nw, w);
			
 
				+        cftleaf(m, isplt, &a[j - m], nw, w);
			
 
				+    }
			
 
				+    return (void *) 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void *cftrec2_th(void *p) /* Thread entry: processes one slice described by the cdft_arg_t that p points to, using cftmdl2 for the top-level passes; always returns a null pointer. */
			
 
				+{
			
 
				+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
			
 
				+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
			
 
				+    void cftmdl2(int n, double *a, double *w);
			
 
				+    int isplt, j, k, m, n, n0, nw;
			
 
				+    double *a, *w;
			
 
				+    /* Unpack the argument record. */
			
 
				+    n0 = ((cdft_arg_t *) p)->n0;
			
 
				+    n = ((cdft_arg_t *) p)->n;
			
 
				+    a = ((cdft_arg_t *) p)->a;
			
 
				+    nw = ((cdft_arg_t *) p)->nw;
			
 
				+    w = ((cdft_arg_t *) p)->w;
			
 
				+    k = 1;
			
 
				+    m = n0; /* block size starts from n0, not from the slice length n */
			
 
				+    while (m > 512) { /* quarter m until it is at most 512; k is multiplied by 4 for every level descended */
			
 
				+        m >>= 2;
			
 
				+        k <<= 2;
			
 
				+        cftmdl2(m, &a[n - m], &w[nw - m]);
			
 
				+    }
			
 
				+    cftleaf(m, 0, &a[n - m], nw, w); /* finish the last block of the slice with isplt = 0 */
			
 
				+    k >>= 1; /* starting block counter for the cfttree calls below (incremented before each call) */
			
 
				+    for (j = n - m; j > 0; j -= m) { /* remaining blocks, from the end of the slice toward its start */
			
 
				+        k++;
			
 
				+        isplt = cfttree(m, j, k, a, nw, w);
			
 
				+        cftleaf(m, isplt, &a[j - m], nw, w);
			
 
				+    }
			
 
				+    return (void *) 0;
			
 
				+}
			
 
				+#endif /* USE_CDFT_THREADS */
			
 
				+
			
 
				+
			
 
				+void cftrec4(int n, double *a, int nw, double *w) /* Single-threaded driver: processes a[0..n-1] in blocks of m <= 512 doubles using cftmdl1, cfttree and cftleaf. */
			
 
				+{ /* nw is used as an end offset into the coefficient table w (sub-tables are addressed as &w[nw - ...]). */
			
 
				+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
			
 
				+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
			
 
				+    void cftmdl1(int n, double *a, double *w);
			
 
				+    int isplt, j, k, m;
			
 
				+
			
 
				+    m = n;
			
 
				+    while (m > 512) { /* quarter m until it is at most 512, applying cftmdl1 to the last m doubles each time */
			
 
				+        m >>= 2;
			
 
				+        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
			
 
				+    }
			
 
				+    cftleaf(m, 1, &a[n - m], nw, w); /* finish the last block */
			
 
				+    k = 0;
			
 
				+    for (j = n - m; j > 0; j -= m) { /* remaining blocks, from the end of the array toward its start; k counts them from 1 */
			
 
				+        k++;
			
 
				+        isplt = cfttree(m, j, k, a, nw, w); /* run the pending higher-level passes that end at offset j */
			
 
				+        cftleaf(m, isplt, &a[j - m], nw, w);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+int cfttree(int n, int j, int k, double *a, int nw, double *w) /* Applies the cftmdl1 / cftmdl2 passes selected by block counter k to the data ending at a[j]; returns the flag (0 or 1) that picked the variant. */
			
 
				+{ /* Callers in this file pass the return value on to cftleaf as isplt. */
			
 
				+    void cftmdl1(int n, double *a, double *w);
			
 
				+    void cftmdl2(int n, double *a, double *w);
			
 
				+    int i, isplt, m;
			
 
				+
			
 
				+    if ((k & 3) != 0) { /* k not a multiple of 4: one pass over the n doubles before a[j] */
			
 
				+        isplt = k & 1;
			
 
				+        if (isplt != 0) {
			
 
				+            cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]);
			
 
				+        } else {
			
 
				+            cftmdl2(n, &a[j - n], &w[nw - n]);
			
 
				+        }
			
 
				+    } else {
			
 
				+        m = n;
			
 
				+        for (i = k; (i & 3) == 0; i >>= 2) { /* strip trailing base-4 zero digits of k, growing m by 4x per digit; NOTE(review): k == 0 would never leave this loop - callers in view always pass k >= 1 */
			
 
				+            m <<= 2;
			
 
				+        }
			
 
				+        isplt = i & 1; /* lowest bit of the stripped counter selects the variant */
			
 
				+        if (isplt != 0) {
			
 
				+            while (m > 128) { /* passes of size m, m/4, ... while m > 128, each over the m doubles before a[j] */
			
 
				+                cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]);
			
 
				+                m >>= 2;
			
 
				+            }
			
 
				+        } else {
			
 
				+            while (m > 128) {
			
 
				+                cftmdl2(m, &a[j - m], &w[nw - m]);
			
 
				+                m >>= 2;
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+    return isplt;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void cftleaf(int n, int isplt, double *a, int nw, double *w) /* Fixed call sequence that finishes one block of a: four sub-blocks, each given a cftmdl1 / cftmdl2 pass followed by four small kernels. */
			
 
				+{ /* isplt only affects the fourth sub-block: non-zero selects cftmdl1 with a "1" kernel on its last slice, zero selects cftmdl2 with a "2" kernel. */
			
 
				+    void cftmdl1(int n, double *a, double *w);
			
 
				+    void cftmdl2(int n, double *a, double *w);
			
 
				+    void cftf161(double *a, double *w);
			
 
				+    void cftf162(double *a, double *w);
			
 
				+    void cftf081(double *a, double *w);
			
 
				+    void cftf082(double *a, double *w);
			
 
				+
			
 
				+    if (n == 512) { /* 512 doubles: sub-blocks of 128, kernels cftf161 / cftf162 on slices spaced 32 doubles apart */
			
 
				+        cftmdl1(128, a, &w[nw - 64]);
			
 
				+        cftf161(a, &w[nw - 8]);
			
 
				+        cftf162(&a[32], &w[nw - 32]);
			
 
				+        cftf161(&a[64], &w[nw - 8]);
			
 
				+        cftf161(&a[96], &w[nw - 8]);
			
 
				+        cftmdl2(128, &a[128], &w[nw - 128]);
			
 
				+        cftf161(&a[128], &w[nw - 8]);
			
 
				+        cftf162(&a[160], &w[nw - 32]);
			
 
				+        cftf161(&a[192], &w[nw - 8]);
			
 
				+        cftf162(&a[224], &w[nw - 32]);
			
 
				+        cftmdl1(128, &a[256], &w[nw - 64]);
			
 
				+        cftf161(&a[256], &w[nw - 8]);
			
 
				+        cftf162(&a[288], &w[nw - 32]);
			
 
				+        cftf161(&a[320], &w[nw - 8]);
			
 
				+        cftf161(&a[352], &w[nw - 8]);
			
 
				+        if (isplt != 0) { /* fourth sub-block: pass type and the kernel for its last slice (a[480]) depend on isplt */
			
 
				+            cftmdl1(128, &a[384], &w[nw - 64]);
			
 
				+            cftf161(&a[480], &w[nw - 8]);
			
 
				+        } else {
			
 
				+            cftmdl2(128, &a[384], &w[nw - 128]);
			
 
				+            cftf162(&a[480], &w[nw - 32]);
			
 
				+        }
			
 
				+        cftf161(&a[384], &w[nw - 8]);
			
 
				+        cftf162(&a[416], &w[nw - 32]);
			
 
				+        cftf161(&a[448], &w[nw - 8]);
			
 
				+    } else { /* any other n: sub-blocks of 64, kernels cftf081 / cftf082 on slices spaced 16 doubles apart (the offsets cover a[0..255], so presumably n == 256 - confirm at callers) */
			
 
				+        cftmdl1(64, a, &w[nw - 32]);
			
 
				+        cftf081(a, &w[nw - 8]);
			
 
				+        cftf082(&a[16], &w[nw - 8]);
			
 
				+        cftf081(&a[32], &w[nw - 8]);
			
 
				+        cftf081(&a[48], &w[nw - 8]);
			
 
				+        cftmdl2(64, &a[64], &w[nw - 64]);
			
 
				+        cftf081(&a[64], &w[nw - 8]);
			
 
				+        cftf082(&a[80], &w[nw - 8]);
			
 
				+        cftf081(&a[96], &w[nw - 8]);
			
 
				+        cftf082(&a[112], &w[nw - 8]);
			
 
				+        cftmdl1(64, &a[128], &w[nw - 32]);
			
 
				+        cftf081(&a[128], &w[nw - 8]);
			
 
				+        cftf082(&a[144], &w[nw - 8]);
			
 
				+        cftf081(&a[160], &w[nw - 8]);
			
 
				+        cftf081(&a[176], &w[nw - 8]);
			
 
				+        if (isplt != 0) { /* fourth sub-block: pass type and the kernel for its last slice (a[240]) depend on isplt */
			
 
				+            cftmdl1(64, &a[192], &w[nw - 32]);
			
 
				+            cftf081(&a[240], &w[nw - 8]);
			
 
				+        } else {
			
 
				+            cftmdl2(64, &a[192], &w[nw - 64]);
			
 
				+            cftf082(&a[240], &w[nw - 8]);
			
 
				+        }
			
 
				+        cftf081(&a[192], &w[nw - 8]);
			
 
				+        cftf082(&a[208], &w[nw - 8]);
			
 
				+        cftf081(&a[224], &w[nw - 8]);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void cftmdl1(int n, double *a, double *w) /* In-place 4-point butterfly pass combining a[j], a[j+m], a[j+2m], a[j+3m] with m = 2 * (n >> 3), using coefficients read directly from w. */
			
 
				+{ /* a: data as (r, i) pairs, overwritten; w: coefficient table, read at w[1] and then in groups of four from w[4] on. */
			
 
				+    int j, j0, j1, j2, j3, k, m, mh;
			
 
				+    double wn4r, wk1r, wk1i, wk3r, wk3i; /* wk1*: coefficient for output block 2, wk3*: for output block 3 */
			
 
				+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
			
 
				+
			
 
				+    mh = n >> 3; /* half of the block stride m */
			
 
				+    m = 2 * mh;
			
 
				+    j1 = m;
			
 
				+    j2 = j1 + m;
			
 
				+    j3 = j2 + m;
			
 
				+    x0r = a[0] + a[j2]; /* pair 0: butterfly without any coefficient multiply */
			
 
				+    x0i = a[1] + a[j2 + 1];
			
 
				+    x1r = a[0] - a[j2];
			
 
				+    x1i = a[1] - a[j2 + 1];
			
 
				+    x2r = a[j1] + a[j3];
			
 
				+    x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+    x3r = a[j1] - a[j3];
			
 
				+    x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+    a[0] = x0r + x2r;
			
 
				+    a[1] = x0i + x2i;
			
 
				+    a[j1] = x0r - x2r;
			
 
				+    a[j1 + 1] = x0i - x2i;
			
 
				+    a[j2] = x1r - x3i;
			
 
				+    a[j2 + 1] = x1i + x3r;
			
 
				+    a[j3] = x1r + x3i;
			
 
				+    a[j3 + 1] = x1i - x3r;
			
 
				+    wn4r = w[1]; /* used as both factors of the mid-point rotation after the loop - presumably cos(pi/4); verify against the table builder */
			
 
				+    k = 0;
			
 
				+    for (j = 2; j < mh; j += 2) { /* each pass handles pair j and the mirrored pair m - j */
			
 
				+        k += 4; /* four table values per pass */
			
 
				+        wk1r = w[k];
			
 
				+        wk1i = w[k + 1];
			
 
				+        wk3r = w[k + 2];
			
 
				+        wk3i = w[k + 3];
			
 
				+        j1 = j + m;
			
 
				+        j2 = j1 + m;
			
 
				+        j3 = j2 + m;
			
 
				+        x0r = a[j] + a[j2];
			
 
				+        x0i = a[j + 1] + a[j2 + 1];
			
 
				+        x1r = a[j] - a[j2];
			
 
				+        x1i = a[j + 1] - a[j2 + 1];
			
 
				+        x2r = a[j1] + a[j3];
			
 
				+        x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+        x3r = a[j1] - a[j3];
			
 
				+        x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+        a[j] = x0r + x2r;
			
 
				+        a[j + 1] = x0i + x2i;
			
 
				+        a[j1] = x0r - x2r;
			
 
				+        a[j1 + 1] = x0i - x2i;
			
 
				+        x0r = x1r - x3i;
			
 
				+        x0i = x1i + x3r;
			
 
				+        a[j2] = wk1r * x0r - wk1i * x0i;
			
 
				+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
			
 
				+        x0r = x1r + x3i;
			
 
				+        x0i = x1i - x3r;
			
 
				+        a[j3] = wk3r * x0r + wk3i * x0i;
			
 
				+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
			
 
				+        j0 = m - j; /* mirrored pair: same coefficients, applied with their r and i parts exchanged */
			
 
				+        j1 = j0 + m;
			
 
				+        j2 = j1 + m;
			
 
				+        j3 = j2 + m;
			
 
				+        x0r = a[j0] + a[j2];
			
 
				+        x0i = a[j0 + 1] + a[j2 + 1];
			
 
				+        x1r = a[j0] - a[j2];
			
 
				+        x1i = a[j0 + 1] - a[j2 + 1];
			
 
				+        x2r = a[j1] + a[j3];
			
 
				+        x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+        x3r = a[j1] - a[j3];
			
 
				+        x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+        a[j0] = x0r + x2r;
			
 
				+        a[j0 + 1] = x0i + x2i;
			
 
				+        a[j1] = x0r - x2r;
			
 
				+        a[j1 + 1] = x0i - x2i;
			
 
				+        x0r = x1r - x3i;
			
 
				+        x0i = x1i + x3r;
			
 
				+        a[j2] = wk1i * x0r - wk1r * x0i;
			
 
				+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
			
 
				+        x0r = x1r + x3i;
			
 
				+        x0i = x1i - x3r;
			
 
				+        a[j3] = wk3i * x0r + wk3r * x0i;
			
 
				+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
			
 
				+    }
			
 
				+    j0 = mh; /* mid-point pair: both rotations reduce to a single multiply by +/- wn4r */
			
 
				+    j1 = j0 + m;
			
 
				+    j2 = j1 + m;
			
 
				+    j3 = j2 + m;
			
 
				+    x0r = a[j0] + a[j2];
			
 
				+    x0i = a[j0 + 1] + a[j2 + 1];
			
 
				+    x1r = a[j0] - a[j2];
			
 
				+    x1i = a[j0 + 1] - a[j2 + 1];
			
 
				+    x2r = a[j1] + a[j3];
			
 
				+    x2i = a[j1 + 1] + a[j3 + 1];
			
 
				+    x3r = a[j1] - a[j3];
			
 
				+    x3i = a[j1 + 1] - a[j3 + 1];
			
 
				+    a[j0] = x0r + x2r;
			
 
				+    a[j0 + 1] = x0i + x2i;
			
 
				+    a[j1] = x0r - x2r;
			
 
				+    a[j1 + 1] = x0i - x2i;
			
 
				+    x0r = x1r - x3i;
			
 
				+    x0i = x1i + x3r;
			
 
				+    a[j2] = wn4r * (x0r - x0i);
			
 
				+    a[j2 + 1] = wn4r * (x0i + x0r);
			
 
				+    x0r = x1r + x3i;
			
 
				+    x0i = x1i - x3r;
			
 
				+    a[j3] = -wn4r * (x0r + x0i);
			
 
				+    a[j3 + 1] = -wn4r * (x0i - x0r);
			
 
				+}
			
 
				+
			
 
				+
			
 
				/*
				 * cftmdl2: in-place butterfly pass over the first 8 * (n >> 3) doubles of a[].
				 *
				 *   n  size parameter; mh = n >> 3 and m = 2 * mh give the block stride.
				 *   a  data array, updated in place.  Going by the r/i suffixes of the
				 *      temporaries, a[2p] / a[2p + 1] are presumably the real / imaginary
				 *      parts of complex point p -- confirm against the caller.
				 *   w  table of precomputed factors.  This routine reads w[1], then
				 *      w[4 .. ] in ascending groups of four, w[.. 2*m - 1] in descending
				 *      groups of four, and finally w[m], w[m + 1].
				 *
				 * Each step combines four positions spaced m doubles apart
				 * (j, j + m, j + 2m, j + 3m).  The offset 0 and the offset mh are handled
				 * outside the loop; the loop handles offset j together with its mirror
				 * m - j in the same iteration.
				 */
				void cftmdl2(int n, double *a, double *w)
				{
				    int j, j0, j1, j2, j3, k, kr, m, mh;
				    double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i;
				    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i;

				    mh = n >> 3;
				    m = 2 * mh;
				    wn4r = w[1];
				    /* --- offset 0: only the single factor wn4r is needed --- */
				    j1 = m;
				    j2 = j1 + m;
				    j3 = j2 + m;
				    x0r = a[0] - a[j2 + 1];
				    x0i = a[1] + a[j2];
				    x1r = a[0] + a[j2 + 1];
				    x1i = a[1] - a[j2];
				    x2r = a[j1] - a[j3 + 1];
				    x2i = a[j1 + 1] + a[j3];
				    x3r = a[j1] + a[j3 + 1];
				    x3i = a[j1 + 1] - a[j3];
				    y0r = wn4r * (x2r - x2i);
				    y0i = wn4r * (x2i + x2r);
				    a[0] = x0r + y0r;
				    a[1] = x0i + y0i;
				    a[j1] = x0r - y0r;
				    a[j1 + 1] = x0i - y0i;
				    y0r = wn4r * (x3r - x3i);
				    y0i = wn4r * (x3i + x3r);
				    a[j2] = x1r - y0i;
				    a[j2 + 1] = x1i + y0r;
				    a[j3] = x1r + y0i;
				    a[j3 + 1] = x1i - y0r;
				    /* k walks the table upwards from 4, kr walks it downwards from 2*m - 4 */
				    k = 0;
				    kr = 2 * m;
				    for (j = 2; j < mh; j += 2) {
				        k += 4;
				        wk1r = w[k];
				        wk1i = w[k + 1];
				        wk3r = w[k + 2];
				        wk3i = w[k + 3];
				        kr -= 4;
				        /* note the i/r order: the descending entries are loaded swapped */
				        wd1i = w[kr];
				        wd1r = w[kr + 1];
				        wd3i = w[kr + 2];
				        wd3r = w[kr + 3];
				        /* first half of the iteration: offset j */
				        j1 = j + m;
				        j2 = j1 + m;
				        j3 = j2 + m;
				        x0r = a[j] - a[j2 + 1];
				        x0i = a[j + 1] + a[j2];
				        x1r = a[j] + a[j2 + 1];
				        x1i = a[j + 1] - a[j2];
				        x2r = a[j1] - a[j3 + 1];
				        x2i = a[j1 + 1] + a[j3];
				        x3r = a[j1] + a[j3 + 1];
				        x3i = a[j1 + 1] - a[j3];
				        y0r = wk1r * x0r - wk1i * x0i;
				        y0i = wk1r * x0i + wk1i * x0r;
				        y2r = wd1r * x2r - wd1i * x2i;
				        y2i = wd1r * x2i + wd1i * x2r;
				        a[j] = y0r + y2r;
				        a[j + 1] = y0i + y2i;
				        a[j1] = y0r - y2r;
				        a[j1 + 1] = y0i - y2i;
				        y0r = wk3r * x1r + wk3i * x1i;
				        y0i = wk3r * x1i - wk3i * x1r;
				        y2r = wd3r * x3r + wd3i * x3i;
				        y2i = wd3r * x3i - wd3i * x3r;
				        a[j2] = y0r + y2r;
				        a[j2 + 1] = y0i + y2i;
				        a[j3] = y0r - y2r;
				        a[j3 + 1] = y0i - y2i;
				        /* second half: mirrored offset m - j, with the wk/wd roles exchanged */
				        j0 = m - j;
				        j1 = j0 + m;
				        j2 = j1 + m;
				        j3 = j2 + m;
				        x0r = a[j0] - a[j2 + 1];
				        x0i = a[j0 + 1] + a[j2];
				        x1r = a[j0] + a[j2 + 1];
				        x1i = a[j0 + 1] - a[j2];
				        x2r = a[j1] - a[j3 + 1];
				        x2i = a[j1 + 1] + a[j3];
				        x3r = a[j1] + a[j3 + 1];
				        x3i = a[j1 + 1] - a[j3];
				        y0r = wd1i * x0r - wd1r * x0i;
				        y0i = wd1i * x0i + wd1r * x0r;
				        y2r = wk1i * x2r - wk1r * x2i;
				        y2i = wk1i * x2i + wk1r * x2r;
				        a[j0] = y0r + y2r;
				        a[j0 + 1] = y0i + y2i;
				        a[j1] = y0r - y2r;
				        a[j1 + 1] = y0i - y2i;
				        y0r = wd3i * x1r + wd3r * x1i;
				        y0i = wd3i * x1i - wd3r * x1r;
				        y2r = wk3i * x3r + wk3r * x3i;
				        y2i = wk3i * x3i - wk3r * x3r;
				        a[j2] = y0r + y2r;
				        a[j2 + 1] = y0i + y2i;
				        a[j3] = y0r - y2r;
				        a[j3 + 1] = y0i - y2i;
				    }
				    /* --- offset mh (the middle): one factor pair taken from w[m], w[m + 1] --- */
				    wk1r = w[m];
				    wk1i = w[m + 1];
				    j0 = mh;
				    j1 = j0 + m;
				    j2 = j1 + m;
				    j3 = j2 + m;
				    x0r = a[j0] - a[j2 + 1];
				    x0i = a[j0 + 1] + a[j2];
				    x1r = a[j0] + a[j2 + 1];
				    x1i = a[j0 + 1] - a[j2];
				    x2r = a[j1] - a[j3 + 1];
				    x2i = a[j1 + 1] + a[j3];
				    x3r = a[j1] + a[j3 + 1];
				    x3i = a[j1 + 1] - a[j3];
				    y0r = wk1r * x0r - wk1i * x0i;
				    y0i = wk1r * x0i + wk1i * x0r;
				    y2r = wk1i * x2r - wk1r * x2i;
				    y2i = wk1i * x2i + wk1r * x2r;
				    a[j0] = y0r + y2r;
				    a[j0 + 1] = y0i + y2i;
				    a[j1] = y0r - y2r;
				    a[j1 + 1] = y0i - y2i;
				    y0r = wk1i * x1r - wk1r * x1i;
				    y0i = wk1i * x1i + wk1r * x1r;
				    y2r = wk1r * x3r - wk1i * x3i;
				    y2i = wk1r * x3i + wk1i * x3r;
				    a[j2] = y0r - y2r;
				    a[j2 + 1] = y0i - y2i;
				    a[j3] = y0r + y2r;
				    a[j3 + 1] = y0i + y2i;
				}
			
 
				+
			
 
				+
			
 
				/*
				 * cftfx41: dispatch four fixed-size kernels over consecutive slices of a[].
				 *
				 *   n == 128 : 32-double slices at a, a+32, a+64, a+96 are handed to
				 *              cftf161 / cftf162 / cftf161 / cftf161.
				 *   otherwise: 16-double slices at a, a+16, a+32, a+48 are handed to
				 *              cftf081 / cftf082 / cftf081 / cftf081.
				 *
				 * Every kernel receives the last 8 entries of w (w + nw - 8), except
				 * cftf162 which receives the last 32 (w + nw - 32).
				 */
				void cftfx41(int n, double *a, int nw, double *w)
				{
				    void cftf161(double *a, double *w);
				    void cftf162(double *a, double *w);
				    void cftf081(double *a, double *w);
				    void cftf082(double *a, double *w);
				    double *const w_tail8 = w + (nw - 8);

				    if (n != 128) {
				        cftf081(a, w_tail8);
				        cftf082(a + 16, w_tail8);
				        cftf081(a + 32, w_tail8);
				        cftf081(a + 48, w_tail8);
				        return;
				    }

				    cftf161(a, w_tail8);
				    /* computed only on this path, exactly as in the indexing form */
				    cftf162(a + 32, w + (nw - 32));
				    cftf161(a + 64, w_tail8);
				    cftf161(a + 96, w_tail8);
				}
			
 
				+
			
 
				+
			
 
				/*
				 * cftf161: fixed-size in-place kernel over a[0..31].
				 *
				 *   a  32 doubles, read and overwritten.  Going by the r/i suffixes,
				 *      a[2p] / a[2p + 1] are presumably the real / imaginary parts of
				 *      complex point p (16 points) -- confirm against the caller.
				 *   w  factor table; only w[1], w[2] and w[3] are read.
				 *
				 * Structure: four load groups (starting at a[0], a[2], a[4], a[6], each
				 * touching entries 8 doubles apart) fill the intermediates y0..y15; four
				 * store groups then write a[24..31], a[16..23], a[8..15], a[0..7].
				 * Every element of a[] is read before any element is written, so the
				 * x* temporaries can be reused freely between groups.
				 */
				void cftf161(double *a, double *w)
				{
				    double wn4r, wk1r, wk1i,
				        x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
				        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
				        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i,
				        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i,
				        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;

				    wn4r = w[1];
				    wk1r = w[2];
				    wk1i = w[3];
				    /* load group 0: a[0], a[8], a[16], a[24] -> y0, y4, y8, y12 (no factor) */
				    x0r = a[0] + a[16];
				    x0i = a[1] + a[17];
				    x1r = a[0] - a[16];
				    x1i = a[1] - a[17];
				    x2r = a[8] + a[24];
				    x2i = a[9] + a[25];
				    x3r = a[8] - a[24];
				    x3i = a[9] - a[25];
				    y0r = x0r + x2r;
				    y0i = x0i + x2i;
				    y4r = x0r - x2r;
				    y4i = x0i - x2i;
				    y8r = x1r - x3i;
				    y8i = x1i + x3r;
				    y12r = x1r + x3i;
				    y12i = x1i - x3r;
				    /* load group 1: a[2], a[10], a[18], a[26] -> y1, y5, y9, y13 (wk1r/wk1i) */
				    x0r = a[2] + a[18];
				    x0i = a[3] + a[19];
				    x1r = a[2] - a[18];
				    x1i = a[3] - a[19];
				    x2r = a[10] + a[26];
				    x2i = a[11] + a[27];
				    x3r = a[10] - a[26];
				    x3i = a[11] - a[27];
				    y1r = x0r + x2r;
				    y1i = x0i + x2i;
				    y5r = x0r - x2r;
				    y5i = x0i - x2i;
				    x0r = x1r - x3i;
				    x0i = x1i + x3r;
				    y9r = wk1r * x0r - wk1i * x0i;
				    y9i = wk1r * x0i + wk1i * x0r;
				    x0r = x1r + x3i;
				    x0i = x1i - x3r;
				    y13r = wk1i * x0r - wk1r * x0i;
				    y13i = wk1i * x0i + wk1r * x0r;
				    /* load group 2: a[4], a[12], a[20], a[28] -> y2, y6, y10, y14 (wn4r) */
				    x0r = a[4] + a[20];
				    x0i = a[5] + a[21];
				    x1r = a[4] - a[20];
				    x1i = a[5] - a[21];
				    x2r = a[12] + a[28];
				    x2i = a[13] + a[29];
				    x3r = a[12] - a[28];
				    x3i = a[13] - a[29];
				    y2r = x0r + x2r;
				    y2i = x0i + x2i;
				    y6r = x0r - x2r;
				    y6i = x0i - x2i;
				    x0r = x1r - x3i;
				    x0i = x1i + x3r;
				    y10r = wn4r * (x0r - x0i);
				    y10i = wn4r * (x0i + x0r);
				    x0r = x1r + x3i;
				    x0i = x1i - x3r;
				    y14r = wn4r * (x0r + x0i);
				    y14i = wn4r * (x0i - x0r);
				    /* load group 3: a[6], a[14], a[22], a[30] -> y3, y7, y11, y15 (wk1 swapped) */
				    x0r = a[6] + a[22];
				    x0i = a[7] + a[23];
				    x1r = a[6] - a[22];
				    x1i = a[7] - a[23];
				    x2r = a[14] + a[30];
				    x2i = a[15] + a[31];
				    x3r = a[14] - a[30];
				    x3i = a[15] - a[31];
				    y3r = x0r + x2r;
				    y3i = x0i + x2i;
				    y7r = x0r - x2r;
				    y7i = x0i - x2i;
				    x0r = x1r - x3i;
				    x0i = x1i + x3r;
				    y11r = wk1i * x0r - wk1r * x0i;
				    y11i = wk1i * x0i + wk1r * x0r;
				    x0r = x1r + x3i;
				    x0i = x1i - x3r;
				    y15r = wk1r * x0r - wk1i * x0i;
				    y15i = wk1r * x0i + wk1i * x0r;
				    /* store group: y12..y15 -> a[24..31] */
				    x0r = y12r - y14r;
				    x0i = y12i - y14i;
				    x1r = y12r + y14r;
				    x1i = y12i + y14i;
				    x2r = y13r - y15r;
				    x2i = y13i - y15i;
				    x3r = y13r + y15r;
				    x3i = y13i + y15i;
				    a[24] = x0r + x2r;
				    a[25] = x0i + x2i;
				    a[26] = x0r - x2r;
				    a[27] = x0i - x2i;
				    a[28] = x1r - x3i;
				    a[29] = x1i + x3r;
				    a[30] = x1r + x3i;
				    a[31] = x1i - x3r;
				    /* store group: y8..y11 -> a[16..23] */
				    x0r = y8r + y10r;
				    x0i = y8i + y10i;
				    x1r = y8r - y10r;
				    x1i = y8i - y10i;
				    x2r = y9r + y11r;
				    x2i = y9i + y11i;
				    x3r = y9r - y11r;
				    x3i = y9i - y11i;
				    a[16] = x0r + x2r;
				    a[17] = x0i + x2i;
				    a[18] = x0r - x2r;
				    a[19] = x0i - x2i;
				    a[20] = x1r - x3i;
				    a[21] = x1i + x3r;
				    a[22] = x1r + x3i;
				    a[23] = x1i - x3r;
				    /* store group: y4..y7 -> a[8..15] (wn4r applied to the y5/y7 terms) */
				    x0r = y5r - y7i;
				    x0i = y5i + y7r;
				    x2r = wn4r * (x0r - x0i);
				    x2i = wn4r * (x0i + x0r);
				    x0r = y5r + y7i;
				    x0i = y5i - y7r;
				    x3r = wn4r * (x0r - x0i);
				    x3i = wn4r * (x0i + x0r);
				    x0r = y4r - y6i;
				    x0i = y4i + y6r;
				    x1r = y4r + y6i;
				    x1i = y4i - y6r;
				    a[8] = x0r + x2r;
				    a[9] = x0i + x2i;
				    a[10] = x0r - x2r;
				    a[11] = x0i - x2i;
				    a[12] = x1r - x3i;
				    a[13] = x1i + x3r;
				    a[14] = x1r + x3i;
				    a[15] = x1i - x3r;
				    /* store group: y0..y3 -> a[0..7] */
				    x0r = y0r + y2r;
				    x0i = y0i + y2i;
				    x1r = y0r - y2r;
				    x1i = y0i - y2i;
				    x2r = y1r + y3r;
				    x2i = y1i + y3i;
				    x3r = y1r - y3r;
				    x3i = y1i - y3i;
				    a[0] = x0r + x2r;
				    a[1] = x0i + x2i;
				    a[2] = x0r - x2r;
				    a[3] = x0i - x2i;
				    a[4] = x1r - x3i;
				    a[5] = x1i + x3r;
				    a[6] = x1r + x3i;
				    a[7] = x1i - x3r;
				}
			
 
				+
			
 
				+
			
 
				/*
				 * cftf162: fixed-size in-place kernel over a[0..31], companion of cftf161.
				 *
				 *   a  32 doubles, read and overwritten.  Going by the r/i suffixes,
				 *      a[2p] / a[2p + 1] are presumably the real / imaginary parts of
				 *      complex point p -- confirm against the caller.
				 *   w  factor table; w[1] and w[4..9] are read.  w[7] is negated on load.
				 *
				 * Structure: eight load steps pair a[q], a[q + 1] with a[q + 16],
				 * a[q + 17] (real and imaginary halves crossed, e.g. a[0] - a[17]) and
				 * fill y0..y15; eight store steps then write a[0..31] in ascending
				 * groups of four.  Every element of a[] is read before any is written.
				 */
				void cftf162(double *a, double *w)
				{
				    double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i,
				        x0r, x0i, x1r, x1i, x2r, x2i,
				        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
				        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i,
				        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i,
				        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;

				    wn4r = w[1];
				    wk1r = w[4];
				    wk1i = w[5];
				    wk3r = w[6];
				    wk3i = -w[7];  /* sign flipped relative to the table entry */
				    wk2r = w[8];
				    wk2i = w[9];
				    /* a[0..1], a[8..9] crossed with a[16..17], a[24..25] -> y0, y4 */
				    x1r = a[0] - a[17];
				    x1i = a[1] + a[16];
				    x0r = a[8] - a[25];
				    x0i = a[9] + a[24];
				    x2r = wn4r * (x0r - x0i);
				    x2i = wn4r * (x0i + x0r);
				    y0r = x1r + x2r;
				    y0i = x1i + x2i;
				    y4r = x1r - x2r;
				    y4i = x1i - x2i;
				    /* same inputs, opposite signs -> y8, y12 */
				    x1r = a[0] + a[17];
				    x1i = a[1] - a[16];
				    x0r = a[8] + a[25];
				    x0i = a[9] - a[24];
				    x2r = wn4r * (x0r - x0i);
				    x2i = wn4r * (x0i + x0r);
				    y8r = x1r - x2i;
				    y8i = x1i + x2r;
				    y12r = x1r + x2i;
				    y12i = x1i - x2r;
				    /* a[2..3], a[10..11] crossed with a[18..19], a[26..27] -> y1, y5 */
				    x0r = a[2] - a[19];
				    x0i = a[3] + a[18];
				    x1r = wk1r * x0r - wk1i * x0i;
				    x1i = wk1r * x0i + wk1i * x0r;
				    x0r = a[10] - a[27];
				    x0i = a[11] + a[26];
				    x2r = wk3i * x0r - wk3r * x0i;
				    x2i = wk3i * x0i + wk3r * x0r;
				    y1r = x1r + x2r;
				    y1i = x1i + x2i;
				    y5r = x1r - x2r;
				    y5i = x1i - x2i;
				    /* same inputs, opposite signs -> y9, y13 */
				    x0r = a[2] + a[19];
				    x0i = a[3] - a[18];
				    x1r = wk3r * x0r - wk3i * x0i;
				    x1i = wk3r * x0i + wk3i * x0r;
				    x0r = a[10] + a[27];
				    x0i = a[11] - a[26];
				    x2r = wk1r * x0r + wk1i * x0i;
				    x2i = wk1r * x0i - wk1i * x0r;
				    y9r = x1r - x2r;
				    y9i = x1i - x2i;
				    y13r = x1r + x2r;
				    y13i = x1i + x2i;
				    /* a[4..5], a[12..13] crossed with a[20..21], a[28..29] -> y2, y6 */
				    x0r = a[4] - a[21];
				    x0i = a[5] + a[20];
				    x1r = wk2r * x0r - wk2i * x0i;
				    x1i = wk2r * x0i + wk2i * x0r;
				    x0r = a[12] - a[29];
				    x0i = a[13] + a[28];
				    x2r = wk2i * x0r - wk2r * x0i;
				    x2i = wk2i * x0i + wk2r * x0r;
				    y2r = x1r + x2r;
				    y2i = x1i + x2i;
				    y6r = x1r - x2r;
				    y6i = x1i - x2i;
				    /* same inputs, opposite signs -> y10, y14 */
				    x0r = a[4] + a[21];
				    x0i = a[5] - a[20];
				    x1r = wk2i * x0r - wk2r * x0i;
				    x1i = wk2i * x0i + wk2r * x0r;
				    x0r = a[12] + a[29];
				    x0i = a[13] - a[28];
				    x2r = wk2r * x0r - wk2i * x0i;
				    x2i = wk2r * x0i + wk2i * x0r;
				    y10r = x1r - x2r;
				    y10i = x1i - x2i;
				    y14r = x1r + x2r;
				    y14i = x1i + x2i;
				    /* a[6..7], a[14..15] crossed with a[22..23], a[30..31] -> y3, y7 */
				    x0r = a[6] - a[23];
				    x0i = a[7] + a[22];
				    x1r = wk3r * x0r - wk3i * x0i;
				    x1i = wk3r * x0i + wk3i * x0r;
				    x0r = a[14] - a[31];
				    x0i = a[15] + a[30];
				    x2r = wk1i * x0r - wk1r * x0i;
				    x2i = wk1i * x0i + wk1r * x0r;
				    y3r = x1r + x2r;
				    y3i = x1i + x2i;
				    y7r = x1r - x2r;
				    y7i = x1i - x2i;
				    /* same inputs, opposite signs -> y11, y15 */
				    x0r = a[6] + a[23];
				    x0i = a[7] - a[22];
				    x1r = wk1i * x0r + wk1r * x0i;
				    x1i = wk1i * x0i - wk1r * x0r;
				    x0r = a[14] + a[31];
				    x0i = a[15] - a[30];
				    x2r = wk3i * x0r - wk3r * x0i;
				    x2i = wk3i * x0i + wk3r * x0r;
				    y11r = x1r + x2r;
				    y11i = x1i + x2i;
				    y15r = x1r - x2r;
				    y15i = x1i - x2i;
				    /* stores: y0..y3 -> a[0..7] */
				    x1r = y0r + y2r;
				    x1i = y0i + y2i;
				    x2r = y1r + y3r;
				    x2i = y1i + y3i;
				    a[0] = x1r + x2r;
				    a[1] = x1i + x2i;
				    a[2] = x1r - x2r;
				    a[3] = x1i - x2i;
				    x1r = y0r - y2r;
				    x1i = y0i - y2i;
				    x2r = y1r - y3r;
				    x2i = y1i - y3i;
				    a[4] = x1r - x2i;
				    a[5] = x1i + x2r;
				    a[6] = x1r + x2i;
				    a[7] = x1i - x2r;
				    /* stores: y4..y7 -> a[8..15] */
				    x1r = y4r - y6i;
				    x1i = y4i + y6r;
				    x0r = y5r - y7i;
				    x0i = y5i + y7r;
				    x2r = wn4r * (x0r - x0i);
				    x2i = wn4r * (x0i + x0r);
				    a[8] = x1r + x2r;
				    a[9] = x1i + x2i;
				    a[10] = x1r - x2r;
				    a[11] = x1i - x2i;
				    x1r = y4r + y6i;
				    x1i = y4i - y6r;
				    x0r = y5r + y7i;
				    x0i = y5i - y7r;
				    x2r = wn4r * (x0r - x0i);
				    x2i = wn4r * (x0i + x0r);
				    a[12] = x1r - x2i;
				    a[13] = x1i + x2r;
				    a[14] = x1r + x2i;
				    a[15] = x1i - x2r;
				    /* stores: y8..y11 -> a[16..23] */
				    x1r = y8r + y10r;
				    x1i = y8i + y10i;
				    x2r = y9r - y11r;
				    x2i = y9i - y11i;
				    a[16] = x1r + x2r;
				    a[17] = x1i + x2i;
				    a[18] = x1r - x2r;
				    a[19] = x1i - x2i;
				    x1r = y8r - y10r;
				    x1i = y8i - y10i;
				    x2r = y9r + y11r;
				    x2i = y9i + y11i;
				    a[20] = x1r - x2i;
				    a[21] = x1i + x2r;
				    a[22] = x1r + x2i;
				    a[23] = x1i - x2r;
				    /* stores: y12..y15 -> a[24..31] */
				    x1r = y12r - y14i;
				    x1i = y12i + y14r;
				    x0r = y13r + y15i;
				    x0i = y13i - y15r;
				    x2r = wn4r * (x0r - x0i);
				    x2i = wn4r * (x0i + x0r);
				    a[24] = x1r + x2r;
				    a[25] = x1i + x2i;
				    a[26] = x1r - x2r;
				    a[27] = x1i - x2i;
				    x1r = y12r + y14i;
				    x1i = y12i - y14r;
				    x0r = y13r - y15i;
				    x0i = y13i + y15r;
				    x2r = wn4r * (x0r - x0i);
				    x2i = wn4r * (x0i + x0r);
				    a[28] = x1r - x2i;
				    a[29] = x1i + x2r;
				    a[30] = x1r + x2i;
				    a[31] = x1i - x2r;
				}
			
 
				+
			
 
				+
			
 
				/*
				 * cftf081: fixed-size in-place kernel over a[0..15]; the only table entry
				 * used is w[1].
				 *
				 * All sixteen inputs are read into named temporaries first, so the
				 * stores at the end can be issued in ascending index order.
				 * Naming: sAB / dAB are the sum / difference of the pairs that start at
				 * a[2*A] and a[2*B]; the r / i suffix selects the even / odd element.
				 */
				void cftf081(double *a, double *w)
				{
				    const double wn4r = w[1];

				    /* pairs starting at a[0], a[8] and a[4], a[12] */
				    const double s04r = a[0] + a[8];
				    const double s04i = a[1] + a[9];
				    const double d04r = a[0] - a[8];
				    const double d04i = a[1] - a[9];
				    const double s26r = a[4] + a[12];
				    const double s26i = a[5] + a[13];
				    const double d26r = a[4] - a[12];
				    const double d26i = a[5] - a[13];

				    const double e0r = s04r + s26r;
				    const double e0i = s04i + s26i;
				    const double e2r = s04r - s26r;
				    const double e2i = s04i - s26i;
				    const double e1r = d04r - d26i;
				    const double e1i = d04i + d26r;
				    const double e3r = d04r + d26i;
				    const double e3i = d04i - d26r;

				    /* pairs starting at a[2], a[10] and a[6], a[14] */
				    const double s15r = a[2] + a[10];
				    const double s15i = a[3] + a[11];
				    const double d15r = a[2] - a[10];
				    const double d15i = a[3] - a[11];
				    const double s37r = a[6] + a[14];
				    const double s37i = a[7] + a[15];
				    const double d37r = a[6] - a[14];
				    const double d37i = a[7] - a[15];

				    const double o0r = s15r + s37r;
				    const double o0i = s15i + s37i;
				    const double o2r = s15r - s37r;
				    const double o2i = s15i - s37i;

				    /* the two remaining odd terms are scaled by wn4r */
				    const double p1r = d15r - d37i;
				    const double p1i = d15i + d37r;
				    const double p3r = d15r + d37i;
				    const double p3i = d15i - d37r;
				    const double o1r = wn4r * (p1r - p1i);
				    const double o1i = wn4r * (p1r + p1i);
				    const double o3r = wn4r * (p3r - p3i);
				    const double o3i = wn4r * (p3r + p3i);

				    a[0] = e0r + o0r;
				    a[1] = e0i + o0i;
				    a[2] = e0r - o0r;
				    a[3] = e0i - o0i;
				    a[4] = e2r - o2i;
				    a[5] = e2i + o2r;
				    a[6] = e2r + o2i;
				    a[7] = e2i - o2r;
				    a[8] = e1r + o1r;
				    a[9] = e1i + o1i;
				    a[10] = e1r - o1r;
				    a[11] = e1i - o1i;
				    a[12] = e3r - o3i;
				    a[13] = e3i + o3r;
				    a[14] = e3r + o3i;
				    a[15] = e3i - o3r;
				}
			
 
				+
			
 
				+
			
 
				/*
				 * cftf082: fixed-size in-place kernel over a[0..15], companion of cftf081.
				 * Reads w[1] (wn4r) and the pair w[2], w[3] (wk1r, wk1i).
				 *
				 * The first half gathers eight intermediate pairs u0..u7 from a[]
				 * (real and imaginary halves of the upper eight entries are crossed,
				 * e.g. a[0] - a[9]); the second half combines them and writes a[0..15]
				 * in ascending groups of four.  No element of a[] is written before
				 * every element has been read.
				 */
				void cftf082(double *a, double *w)
				{
				    const double wn4r = w[1];
				    const double wk1r = w[2];
				    const double wk1i = w[3];
				    double tr, ti;  /* scratch pair, reloaded before each product */
				    double pr, pi;  /* left operand of an output group */
				    double qr, qi;  /* right operand of an output group */

				    /* unscaled terms */
				    const double u0r = a[0] - a[9];
				    const double u0i = a[1] + a[8];
				    const double u1r = a[0] + a[9];
				    const double u1i = a[1] - a[8];

				    /* terms scaled by wn4r */
				    tr = a[4] - a[13];
				    ti = a[5] + a[12];
				    const double u2r = wn4r * (tr - ti);
				    const double u2i = wn4r * (ti + tr);
				    tr = a[4] + a[13];
				    ti = a[5] - a[12];
				    const double u3r = wn4r * (tr - ti);
				    const double u3i = wn4r * (ti + tr);

				    /* terms multiplied by (wk1r, wk1i) or by the swapped pair */
				    tr = a[2] - a[11];
				    ti = a[3] + a[10];
				    const double u4r = wk1r * tr - wk1i * ti;
				    const double u4i = wk1r * ti + wk1i * tr;
				    tr = a[2] + a[11];
				    ti = a[3] - a[10];
				    const double u5r = wk1i * tr - wk1r * ti;
				    const double u5i = wk1i * ti + wk1r * tr;
				    tr = a[6] - a[15];
				    ti = a[7] + a[14];
				    const double u6r = wk1i * tr - wk1r * ti;
				    const double u6i = wk1i * ti + wk1r * tr;
				    tr = a[6] + a[15];
				    ti = a[7] - a[14];
				    const double u7r = wk1r * tr - wk1i * ti;
				    const double u7i = wk1r * ti + wk1i * tr;

				    /* a[0..3] */
				    pr = u0r + u2r;
				    pi = u0i + u2i;
				    qr = u4r + u6r;
				    qi = u4i + u6i;
				    a[0] = pr + qr;
				    a[1] = pi + qi;
				    a[2] = pr - qr;
				    a[3] = pi - qi;

				    /* a[4..7] */
				    pr = u0r - u2r;
				    pi = u0i - u2i;
				    qr = u4r - u6r;
				    qi = u4i - u6i;
				    a[4] = pr - qi;
				    a[5] = pi + qr;
				    a[6] = pr + qi;
				    a[7] = pi - qr;

				    /* a[8..11] */
				    pr = u1r - u3i;
				    pi = u1i + u3r;
				    qr = u5r - u7r;
				    qi = u5i - u7i;
				    a[8] = pr + qr;
				    a[9] = pi + qi;
				    a[10] = pr - qr;
				    a[11] = pi - qi;

				    /* a[12..15] */
				    pr = u1r + u3i;
				    pi = u1i - u3r;
				    qr = u5r + u7r;
				    qi = u5i + u7i;
				    a[12] = pr - qi;
				    a[13] = pi + qr;
				    a[14] = pr + qi;
				    a[15] = pi - qr;
				}
			
 
				+
			
 
				+
			
 
				/*
				 * cftf040: fixed-size in-place kernel over a[0..7].
				 * Differs from cftb040 only in the sign applied to the cross terms
				 * written to a[2], a[3], a[6], a[7].
				 */
				void cftf040(double *a)
				{
				    /* sums and differences of the pairs (a[0..1], a[4..5]) and (a[2..3], a[6..7]) */
				    const double sum02r = a[0] + a[4];
				    const double sum02i = a[1] + a[5];
				    const double dif02r = a[0] - a[4];
				    const double dif02i = a[1] - a[5];
				    const double sum13r = a[2] + a[6];
				    const double sum13i = a[3] + a[7];
				    const double dif13r = a[2] - a[6];
				    const double dif13i = a[3] - a[7];

				    a[0] = sum02r + sum13r;
				    a[1] = sum02i + sum13i;
				    a[4] = sum02r - sum13r;
				    a[5] = sum02i - sum13i;

				    a[2] = dif02r - dif13i;
				    a[3] = dif02i + dif13r;
				    a[6] = dif02r + dif13i;
				    a[7] = dif02i - dif13r;
				}
			
 
				+
			
 
				+
			
 
				/*
				 * cftb040: fixed-size in-place kernel over a[0..7].
				 * Same sums and differences as cftf040, but the cross terms written to
				 * a[2], a[3], a[6], a[7] carry the opposite sign.
				 */
				void cftb040(double *a)
				{
				    /* sums and differences of the pairs (a[0..1], a[4..5]) and (a[2..3], a[6..7]) */
				    const double sum02r = a[0] + a[4];
				    const double sum02i = a[1] + a[5];
				    const double dif02r = a[0] - a[4];
				    const double dif02i = a[1] - a[5];
				    const double sum13r = a[2] + a[6];
				    const double sum13i = a[3] + a[7];
				    const double dif13r = a[2] - a[6];
				    const double dif13i = a[3] - a[7];

				    a[0] = sum02r + sum13r;
				    a[1] = sum02i + sum13i;
				    a[4] = sum02r - sum13r;
				    a[5] = sum02i - sum13i;

				    a[2] = dif02r + dif13i;
				    a[3] = dif02i - dif13r;
				    a[6] = dif02r - dif13i;
				    a[7] = dif02i + dif13r;
				}
			
 
				+
			
 
				+
			
 
				/*
				 * cftx020: replace the two pairs (a[0], a[1]) and (a[2], a[3]) by their
				 * element-wise sum (stored in a[0], a[1]) and difference (stored in
				 * a[2], a[3]).
				 */
				void cftx020(double *a)
				{
				    const double first_r = a[0];
				    const double first_i = a[1];
				    const double second_r = a[2];
				    const double second_i = a[3];

				    a[0] = first_r + second_r;
				    a[1] = first_i + second_i;
				    a[2] = first_r - second_r;
				    a[3] = first_i - second_i;
				}
			
 
				+
			
 
				+
			
 
				/*
				 * rftfsub: in-place correction pass over a[2 .. n - 1].
				 *
				 *   n   length parameter; pairs at offset lo (2, 4, ... < n / 2) are
				 *       combined with their mirror at offset n - lo.
				 *   a   data array, updated in place.
				 *   nc  index range of the coefficient table; the stride through the
				 *       table is 2 * nc / (n / 2).
				 *   c   coefficient table; entries c[t] and c[nc - t] are read.
				 *
				 * rftbsub is the same loop with the sign of the wki terms flipped.
				 */
				void rftfsub(int n, double *a, int nc, double *c)
				{
				    const int half = n >> 1;
				    const int stride = 2 * nc / half;
				    int lo;
				    int t = 0;

				    for (lo = 2; lo < half; lo += 2) {
				        const int hi = n - lo;
				        double wkr, wki, xr, xi, yr, yi;

				        t += stride;
				        wkr = 0.5 - c[nc - t];
				        wki = c[t];
				        xr = a[lo] - a[hi];
				        xi = a[lo + 1] + a[hi + 1];
				        yr = wkr * xr - wki * xi;
				        yi = wkr * xi + wki * xr;
				        a[lo] -= yr;
				        a[lo + 1] -= yi;
				        a[hi] += yr;
				        a[hi + 1] -= yi;
				    }
				}
			
 
				+
			
 
				+
			
 
				/*
				 * rftbsub: in-place correction pass over a[2 .. n - 1].
				 *
				 *   n   length parameter; pairs at offset lo (2, 4, ... < n / 2) are
				 *       combined with their mirror at offset n - lo.
				 *   a   data array, updated in place.
				 *   nc  index range of the coefficient table; the stride through the
				 *       table is 2 * nc / (n / 2).
				 *   c   coefficient table; entries c[t] and c[nc - t] are read.
				 *
				 * rftfsub is the same loop with the sign of the wki terms flipped.
				 */
				void rftbsub(int n, double *a, int nc, double *c)
				{
				    const int half = n >> 1;
				    const int stride = 2 * nc / half;
				    int lo;
				    int t = 0;

				    for (lo = 2; lo < half; lo += 2) {
				        const int hi = n - lo;
				        double wkr, wki, xr, xi, yr, yi;

				        t += stride;
				        wkr = 0.5 - c[nc - t];
				        wki = c[t];
				        xr = a[lo] - a[hi];
				        xi = a[lo + 1] + a[hi + 1];
				        yr = wkr * xr + wki * xi;
				        yi = wkr * xi - wki * xr;
				        a[lo] -= yr;
				        a[lo + 1] -= yi;
				        a[hi] += yr;
				        a[hi + 1] -= yi;
				    }
				}
			
 
				+
			
 
				+
			
--- a/ggml/examples/kaldi-native-fbank/csrc/log.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/log.cc
@@ -0,0 +1,142 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Stack trace related stuff is from kaldi.
			
 
				+ * Refer to
			
 
				+ * https://github.com/kaldi-asr/kaldi/blob/master/src/base/kaldi-error.cc
			
 
				+ */
			
 
				+
			
 
				+#include "log.h"
			
 
				+
			
 
				+#ifdef KNF_HAVE_EXECINFO_H
			
 
				+#include <execinfo.h>  // To get stack trace in error messages.
			
 
				+#ifdef KNF_HAVE_CXXABI_H
			
 
				+#include <cxxabi.h>  // For name demangling.
			
 
				+// Useful to decode the stack trace, but only used if we have execinfo.h
			
 
				+#endif  // KNF_HAVE_CXXABI_H
			
 
				+#endif  // KNF_HAVE_EXECINFO_H
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+#include <ctime>
			
 
				+#include <iomanip>
			
 
				+#include <string>
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				// Returns the current local time formatted as "yyyy-mm-dd hh:mm:ss".
				//
				// The broken-down time is written into a caller-owned std::tm through the
				// reentrant conversion function of the platform (localtime_s on Windows,
				// localtime_r elsewhere).  std::localtime() is avoided on purpose: it
				// returns a pointer to shared static storage, so two threads logging at
				// the same time could corrupt each other's timestamp, and it may return
				// a null pointer, which the previous code dereferenced unconditionally.
				//
				// Returns an empty string if the conversion fails.
				std::string GetDateTimeStr() {
				  const std::time_t now = std::time(nullptr);
				  std::tm local_tm{};
				#if defined(_WIN32)
				  // Microsoft's localtime_s takes (tm*, const time_t*) and returns 0 on
				  // success.
				  const bool converted = localtime_s(&local_tm, &now) == 0;
				#else
				  const bool converted = localtime_r(&now, &local_tm) != nullptr;
				#endif
				  if (!converted) {
				    return std::string();
				  }

				  std::ostringstream os;
				  os << std::put_time(&local_tm, "%F %T");  // yyyy-mm-dd hh:mm:ss
				  return os.str();
				}
			
 
				+
			
 
				// Locates a mangled symbol inside one line of backtrace_symbols() output.
				//
				// The symbol is taken to start at the first '_' whose preceding character
				// is ' ' or '(' (so an underscore at index 0 never qualifies) and to end
				// just before the next ' ' or '+'.
				//
				// On return *begin holds the start index, or std::string::npos if no
				// start was found; in that case *end is left untouched.  Otherwise *end
				// holds the index of the terminating character, or npos if there is none.
				// Returns true only when both ends were found.
				static bool LocateSymbolRange(const std::string &trace_name, std::size_t *begin,
				                              std::size_t *end) {
				  std::size_t underscore = trace_name.find('_', 1);
				  while (underscore != std::string::npos) {
				    const char before = trace_name[underscore - 1];
				    if (before == ' ' || before == '(') {
				      break;
				    }
				    underscore = trace_name.find('_', underscore + 1);
				  }

				  *begin = underscore;
				  if (underscore == std::string::npos) {
				    return false;
				  }

				  *end = trace_name.find_first_of(" +", underscore);
				  return *end != std::string::npos;
				}
			
 
				+
			
 
				#ifdef KNF_HAVE_EXECINFO_H
				// Rewrites one line of backtrace_symbols() output so that the mangled C++
				// symbol it contains is shown in readable form.
				//
				// Line formats this is meant to cope with:
				//
				//   Linux:
				//     ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
				//   Mac:
				//     0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813
				//
				// The symbol (e.g. '_ZN5kaldi13UnitTestErrorEv') is cut out, demangled into
				// something like kaldi::UnitTestError, and spliced back between the
				// unchanged prefix and suffix of the line.
				//
				// The line is returned unchanged when <cxxabi.h> is not available or when
				// no symbol can be located; if the symbol is found but cannot be
				// demangled, it is kept in its mangled form.
				static std::string Demangle(const std::string &trace_name) {
				#ifdef KNF_HAVE_CXXABI_H
				  std::size_t begin = 0;
				  std::size_t end = 0;
				  if (!LocateSymbolRange(trace_name, &begin, &end)) {
				    return trace_name;
				  }

				  std::string readable = trace_name.substr(begin, end - begin);
				  int status = 0;
				  // With a null output buffer __cxa_demangle() allocates the result with
				  // malloc(); ownership passes to us, hence the free() below.
				  char *demangled =
				      abi::__cxa_demangle(readable.c_str(), nullptr, nullptr, &status);
				  if (status == 0 && demangled != nullptr) {
				    readable = demangled;
				    free(demangled);
				  }

				  const std::string prefix = trace_name.substr(0, begin);
				  const std::string suffix = trace_name.substr(end);
				  return prefix + readable + suffix;
				#else   // KNF_HAVE_CXXABI_H
				  return trace_name;
				#endif  // KNF_HAVE_CXXABI_H
				}
				#endif  // KNF_HAVE_EXECINFO_H
			
 
				+
			
 
				// Returns a printable stack trace of the calling thread, one frame per
				// line, preceded by the header "[ Stack-Trace: ]".
				//
				// Returns an empty string when <execinfo.h> is not available
				// (KNF_HAVE_EXECINFO_H undefined) or when backtrace_symbols() fails.
				//
				// At most kMaxTraceSize frames are captured.  If more than kMaxTracePrint
				// frames were captured, only the first and last kMaxTracePrint / 2 are
				// printed, separated by three lines containing a single dot.
				std::string GetStackTrace() {
				  std::string ans;
				#ifdef KNF_HAVE_EXECINFO_H
				  constexpr std::size_t kMaxTraceSize = 50;
				  constexpr std::size_t kMaxTracePrint = 50;  // Must be even.

				  // Buffer for the trace.
				  void *frames[kMaxTraceSize];
				  const std::size_t num_frames = backtrace(frames, kMaxTraceSize);

				  char **symbols = backtrace_symbols(frames, num_frames);
				  if (symbols == nullptr) {
				    return ans;
				  }

				  // Appends the demangled frames in [first, last) to ans.
				  auto append_frames = [&ans, symbols](std::size_t first, std::size_t last) {
				    for (std::size_t i = first; i < last; ++i) {
				      ans += Demangle(symbols[i]) + "\n";
				    }
				  };

				  ans += "[ Stack-Trace: ]\n";
				  if (num_frames > kMaxTracePrint) {
				    // Too many frames: print the head and the tail only.
				    append_frames(0, kMaxTracePrint / 2);
				    ans += ".\n.\n.\n";
				    append_frames(num_frames - kMaxTracePrint / 2, num_frames);
				    if (num_frames == kMaxTraceSize) {
				      ans += ".\n.\n.\n";  // Stack was too long, probably a bug.
				    }
				  } else {
				    append_frames(0, num_frames);
				  }

				  // backtrace_symbols() returns a single allocation: the pointer array is
				  // freed here, the strings it points to must not be freed individually.
				  free(symbols);
				#endif  // KNF_HAVE_EXECINFO_H
				  return ans;
				}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/csrc/log.h
+++ b/ggml/examples/kaldi-native-fbank/csrc/log.h
@@ -0,0 +1,383 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+// The content in this file is copied/modified from
			
 
				+// https://github.com/k2-fsa/k2/blob/master/k2/csrc/log.h
			
 
				+#ifndef KALDI_NATIVE_FBANK_CSRC_LOG_H_
			
 
				+#define KALDI_NATIVE_FBANK_CSRC_LOG_H_
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+
			
 
				+#include <mutex>  // NOLINT
			
 
				+#include <sstream>
			
 
				+#include <string>
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+#if KNF_ENABLE_CHECK
			
 
				+
			
 
// kDisableDebug is true when this header is compiled with NDEBUG defined and
// false otherwise. The KNF_DCHECK* / KNF_DLOG macros test it and collapse to
// `(void)0` when it is true.
#if defined(NDEBUG)
constexpr bool kDisableDebug = true;
#else
constexpr bool kDisableDebug = false;
#endif
			
 
				+
			
 
// Severity of a log message, ordered from least (kTrace) to most (kFatal)
// severe. A message is printed when the current log level is <= its level.
enum class LogLevel {
  kTrace = 0,
  kDebug = 1,
  kInfo = 2,
  kWarning = 3,
  kError = 4,
  kFatal = 5,  // print message, then throw or abort (see Logger::~Logger)
};

// They are used in KNF_LOG(xxx), so their names
// do not follow the google c++ code style
//
// You can use them in the following way:
//
//  KNF_LOG(TRACE) << "some message";
//  KNF_LOG(DEBUG) << "some message";
//
// With MSVC the names are macros rather than constants.
// NOTE(review): presumably because Windows headers already define some of
// these names (e.g. ERROR) as macros -- confirm before changing.
#ifndef _MSC_VER
constexpr LogLevel TRACE = LogLevel::kTrace;
constexpr LogLevel DEBUG = LogLevel::kDebug;
constexpr LogLevel INFO = LogLevel::kInfo;
constexpr LogLevel WARNING = LogLevel::kWarning;
constexpr LogLevel ERROR = LogLevel::kError;
constexpr LogLevel FATAL = LogLevel::kFatal;
#else
#define TRACE LogLevel::kTrace
#define DEBUG LogLevel::kDebug
#define INFO LogLevel::kInfo
#define WARNING LogLevel::kWarning
#define ERROR LogLevel::kError
#define FATAL LogLevel::kFatal
#endif
			
 
				+
			
 
				+std::string GetStackTrace();
			
 
				+
			
 
				+/* Return the current log level.
			
 
				+
			
 
				+
			
 
				+   If the current log level is TRACE, then all logged messages are printed out.
			
 
				+
			
 
				+   If the current log level is DEBUG, log messages with "TRACE" level are not
			
 
				+   shown and all other levels are printed out.
			
 
				+
			
 
				+   Similarly, if the current log level is INFO, log message with "TRACE" and
			
 
				+   "DEBUG" are not shown and all other levels are printed out.
			
 
				+
			
 
				+   If it is FATAL, then only FATAL messages are shown.
			
 
				+ */
			
 
				+inline LogLevel GetCurrentLogLevel() {
			
 
				+  static LogLevel log_level = INFO;
			
 
				+  static std::once_flag init_flag;
			
 
				+  std::call_once(init_flag, []() {
			
 
				+    const char *env_log_level = std::getenv("KNF_LOG_LEVEL");
			
 
				+    if (env_log_level == nullptr) return;
			
 
				+
			
 
				+    std::string s = env_log_level;
			
 
				+    if (s == "TRACE")
			
 
				+      log_level = TRACE;
			
 
				+    else if (s == "DEBUG")
			
 
				+      log_level = DEBUG;
			
 
				+    else if (s == "INFO")
			
 
				+      log_level = INFO;
			
 
				+    else if (s == "WARNING")
			
 
				+      log_level = WARNING;
			
 
				+    else if (s == "ERROR")
			
 
				+      log_level = ERROR;
			
 
				+    else if (s == "FATAL")
			
 
				+      log_level = FATAL;
			
 
				+    else
			
 
				+      fprintf(stderr,
			
 
				+              "Unknown KNF_LOG_LEVEL: %s"
			
 
				+              "\nSupported values are: "
			
 
				+              "TRACE, DEBUG, INFO, WARNING, ERROR, FATAL",
			
 
				+              s.c_str());
			
 
				+  });
			
 
				+  return log_level;
			
 
				+}
			
 
				+
			
 
				+inline bool EnableAbort() {
			
 
				+  static std::once_flag init_flag;
			
 
				+  static bool enable_abort = false;
			
 
				+  std::call_once(init_flag, []() {
			
 
				+    enable_abort = (std::getenv("KNF_ABORT") != nullptr);
			
 
				+  });
			
 
				+  return enable_abort;
			
 
				+}
			
 
				+
			
 
				+class Logger {
			
 
				+ public:
			
 
				+  Logger(const char *filename, const char *func_name, uint32_t line_num,
			
 
				+         LogLevel level)
			
 
				+      : filename_(filename),
			
 
				+        func_name_(func_name),
			
 
				+        line_num_(line_num),
			
 
				+        level_(level) {
			
 
				+    cur_level_ = GetCurrentLogLevel();
			
 
				+    fprintf(stderr, "here\n");
			
 
				+    switch (level) {
			
 
				+      case TRACE:
			
 
				+        if (cur_level_ <= TRACE) fprintf(stderr, "[T] ");
			
 
				+        break;
			
 
				+      case DEBUG:
			
 
				+        if (cur_level_ <= DEBUG) fprintf(stderr, "[D] ");
			
 
				+        break;
			
 
				+      case INFO:
			
 
				+        if (cur_level_ <= INFO) fprintf(stderr, "[I] ");
			
 
				+        break;
			
 
				+      case WARNING:
			
 
				+        if (cur_level_ <= WARNING) fprintf(stderr, "[W] ");
			
 
				+        break;
			
 
				+      case ERROR:
			
 
				+        if (cur_level_ <= ERROR) fprintf(stderr, "[E] ");
			
 
				+        break;
			
 
				+      case FATAL:
			
 
				+        if (cur_level_ <= FATAL) fprintf(stderr, "[F] ");
			
 
				+        break;
			
 
				+    }
			
 
				+
			
 
				+    if (cur_level_ <= level_) {
			
 
				+      fprintf(stderr, "%s:%u:%s ", filename, line_num, func_name);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  ~Logger() noexcept(false) {
			
 
				+    static constexpr const char *kErrMsg = R"(
			
 
				+    Some bad things happened. Please read the above error messages and stack
			
 
				+    trace. If you are using Python, the following command may be helpful:
			
 
				+
			
 
				+      gdb --args python /path/to/your/code.py
			
 
				+
			
 
				+    (You can use `gdb` to debug the code. Please consider compiling
			
 
				+    a debug version of KNF.).
			
 
				+
			
 
				+    If you are unable to fix it, please open an issue at:
			
 
				+
			
 
				+      https://github.com/csukuangfj/kaldi-native-fbank/issues/new
			
 
				+    )";
			
 
				+    fprintf(stderr, "\n");
			
 
				+    if (level_ == FATAL) {
			
 
				+      std::string stack_trace = GetStackTrace();
			
 
				+      if (!stack_trace.empty()) {
			
 
				+        fprintf(stderr, "\n\n%s\n", stack_trace.c_str());
			
 
				+      }
			
 
				+
			
 
				+      fflush(nullptr);
			
 
				+
			
 
				+#ifndef __ANDROID_API__
			
 
				+      if (EnableAbort()) {
			
 
				+        // NOTE: abort() will terminate the program immediately without
			
 
				+        // printing the Python stack backtrace.
			
 
				+        abort();
			
 
				+      }
			
 
				+
			
 
				+      throw std::runtime_error(kErrMsg);
			
 
				+#else
			
 
				+      abort();
			
 
				+#endif
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  const Logger &operator<<(bool b) const {
			
 
				+    if (cur_level_ <= level_) {
			
 
				+      fprintf(stderr, b ? "true" : "false");
			
 
				+    }
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  const Logger &operator<<(int8_t i) const {
			
 
				+    if (cur_level_ <= level_) fprintf(stderr, "%d", i);
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  const Logger &operator<<(const char *s) const {
			
 
				+    if (cur_level_ <= level_) fprintf(stderr, "%s", s);
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  const Logger &operator<<(int32_t i) const {
			
 
				+    if (cur_level_ <= level_) fprintf(stderr, "%d", i);
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  const Logger &operator<<(uint32_t i) const {
			
 
				+    if (cur_level_ <= level_) fprintf(stderr, "%u", i);
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  const Logger &operator<<(uint64_t i) const {
			
 
				+    if (cur_level_ <= level_)
			
 
				+      fprintf(stderr, "%llu", (long long unsigned int)i);  // NOLINT
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  const Logger &operator<<(int64_t i) const {
			
 
				+    if (cur_level_ <= level_)
			
 
				+      fprintf(stderr, "%lli", (long long int)i);  // NOLINT
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  const Logger &operator<<(float f) const {
			
 
				+    if (cur_level_ <= level_) fprintf(stderr, "%f", f);
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  const Logger &operator<<(double d) const {
			
 
				+    if (cur_level_ <= level_) fprintf(stderr, "%f", d);
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  template <typename T>
			
 
				+  const Logger &operator<<(const T &t) const {
			
 
				+    // require T overloads operator<<
			
 
				+    std::ostringstream os;
			
 
				+    os << t;
			
 
				+    return *this << os.str().c_str();
			
 
				+  }
			
 
				+
			
 
				+  // specialization to fix compile error: `stringstream << nullptr` is ambiguous
			
 
				+  const Logger &operator<<(const std::nullptr_t &null) const {
			
 
				+    if (cur_level_ <= level_) *this << "(null)";
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+ private:
			
 
				+  const char *filename_;
			
 
				+  const char *func_name_;
			
 
				+  uint32_t line_num_;
			
 
				+  LogLevel level_;
			
 
				+  LogLevel cur_level_;
			
 
				+};
			
 
				+#endif  // KNF_ENABLE_CHECK
			
 
				+
			
 
// Helper type used by the KNF_CHECK* / KNF_LOG macros.
//
// When checks are enabled, `Voidifier() & Logger(...)` turns a Logger
// expression into `void`, so that it can be the second branch of a
// `cond ? (void)0 : ...` conditional expression.
//
// When KNF_ENABLE_CHECK is not defined, the macros expand to a bare
// `Voidifier()` and the operator<< below swallows whatever is streamed into
// it, so `KNF_LOG(x) << ...` still compiles but does nothing.
//
// NOTE(review): the guards in this header are not consistent -- `#if
// KNF_ENABLE_CHECK` here, `#if !defined(KNF_ENABLE_CHECK)` below and `#ifdef
// KNF_ENABLE_CHECK` around the macros. Defining KNF_ENABLE_CHECK as 0 enables
// neither operator; confirm whether that configuration must be supported.
class Voidifier {
 public:
#if KNF_ENABLE_CHECK
  // Discards the Logger; exists only to give the expression type `void`.
  void operator&(const Logger &) const {}
#endif
};
#if !defined(KNF_ENABLE_CHECK)
// No-op sink: ignores the streamed value and returns the Voidifier unchanged.
template <typename T>
const Voidifier &operator<<(const Voidifier &v, T &&) {
  return v;
}
#endif
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
// Compile-time assertion with an empty message.
#define KNF_STATIC_ASSERT(x) static_assert(x, "")

// NOTE(review): this tests whether KNF_ENABLE_CHECK is *defined*, whereas the
// Logger class above is guarded by the *value* of KNF_ENABLE_CHECK.
#ifdef KNF_ENABLE_CHECK

// KNF_FUNC names the enclosing function in log prefixes.
#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) || \
    defined(__PRETTY_FUNCTION__)
// for clang and GCC
#define KNF_FUNC __PRETTY_FUNCTION__
#else
// for other compilers
#define KNF_FUNC __func__
#endif

// Logs a FATAL record if `x` is false. Expands to an unparenthesized
// conditional expression so that a message can be appended with <<, e.g.
//
//      KNF_CHECK(ptr != nullptr) << "Some message";
#define KNF_CHECK(x)                                                  \
  (x) ? (void)0                                                       \
      : ::knf::Voidifier() &                                          \
            ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
                << "Check failed: " << #x << " "

// WARNING: x and y may be evaluated multiple times, but this happens only
// when the check fails. Since the program aborts if it fails, we don't think
// the extra evaluation of x and y matters.
//
// CAUTION: we recommend the following use case:
//
//      auto x = Foo();
//      auto y = Bar();
//      KNF_CHECK_EQ(x, y) << "Some message";
//
//  And please avoid
//
//      KNF_CHECK_EQ(Foo(), Bar());
//
//  if `Foo()` or `Bar()` causes some side effects, e.g., changing some
//  local static variables or global variables.
#define _KNF_CHECK_OP(x, y, op)                                              \
  ((x)op(y)) ? (void)0                                                       \
             : ::knf::Voidifier() &                                          \
                   ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
                       << "Check failed: " << #x << " " << #op << " " << #y  \
                       << " (" << (x) << " vs. " << (y) << ") "

#define KNF_CHECK_EQ(x, y) _KNF_CHECK_OP(x, y, ==)
#define KNF_CHECK_NE(x, y) _KNF_CHECK_OP(x, y, !=)
#define KNF_CHECK_LT(x, y) _KNF_CHECK_OP(x, y, <)
#define KNF_CHECK_LE(x, y) _KNF_CHECK_OP(x, y, <=)
#define KNF_CHECK_GT(x, y) _KNF_CHECK_OP(x, y, >)
#define KNF_CHECK_GE(x, y) _KNF_CHECK_OP(x, y, >=)

// Starts a log record at level `x` (TRACE, DEBUG, INFO, WARNING, ERROR or
// FATAL); append the message with <<.
#define KNF_LOG(x) ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::x)

// ------------------------------------------------------------
//       For debug check
// ------------------------------------------------------------
// If you define the macro "-D NDEBUG" while compiling kaldi-native-fbank,
// the following macros are in fact empty and do nothing.

#define KNF_DCHECK(x) ::knf::kDisableDebug ? (void)0 : KNF_CHECK(x)

#define KNF_DCHECK_EQ(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_EQ(x, y)

#define KNF_DCHECK_NE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_NE(x, y)

#define KNF_DCHECK_LT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LT(x, y)

#define KNF_DCHECK_LE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LE(x, y)

#define KNF_DCHECK_GT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GT(x, y)

#define KNF_DCHECK_GE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GE(x, y)

#define KNF_DLOG(x) \
  ::knf::kDisableDebug ? (void)0 : ::knf::Voidifier() & KNF_LOG(x)

#else

// Checks are disabled: every macro expands to a Voidifier temporary. The
// macro arguments are not evaluated, and anything streamed into the result
// is discarded.

#define KNF_CHECK(x) ::knf::Voidifier()
#define KNF_LOG(x) ::knf::Voidifier()

#define KNF_CHECK_EQ(x, y) ::knf::Voidifier()
#define KNF_CHECK_NE(x, y) ::knf::Voidifier()
#define KNF_CHECK_LT(x, y) ::knf::Voidifier()
#define KNF_CHECK_LE(x, y) ::knf::Voidifier()
#define KNF_CHECK_GT(x, y) ::knf::Voidifier()
#define KNF_CHECK_GE(x, y) ::knf::Voidifier()

#define KNF_DCHECK(x) ::knf::Voidifier()
#define KNF_DLOG(x) ::knf::Voidifier()
#define KNF_DCHECK_EQ(x, y) ::knf::Voidifier()
#define KNF_DCHECK_NE(x, y) ::knf::Voidifier()
#define KNF_DCHECK_LT(x, y) ::knf::Voidifier()
#define KNF_DCHECK_LE(x, y) ::knf::Voidifier()
#define KNF_DCHECK_GT(x, y) ::knf::Voidifier()
#define KNF_DCHECK_GE(x, y) ::knf::Voidifier()

#endif  // KNF_ENABLE_CHECK
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_CSRC_LOG_H_
			
--- a/ggml/examples/kaldi-native-fbank/csrc/mel-computations.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/mel-computations.cc
@@ -0,0 +1,257 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+// This file is copied/modified from kaldi/src/feat/mel-computations.cc
			
 
				+
			
 
				+#include "mel-computations.h"
			
 
				+
			
 
				+#include <algorithm>
			
 
				+#include <sstream>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "feature-window.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts) {
			
 
				+  os << opts.ToString();
			
 
				+  return os;
			
 
				+}
			
 
				+
			
 
				+float MelBanks::VtlnWarpFreq(
			
 
				+    float vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
			
 
				+    float vtln_high_cutoff,
			
 
				+    float low_freq,  // upper+lower frequency cutoffs in mel computation
			
 
				+    float high_freq, float vtln_warp_factor, float freq) {
			
 
				+  /// This computes a VTLN warping function that is not the same as HTK's one,
			
 
				+  /// but has similar inputs (this function has the advantage of never producing
			
 
				+  /// empty bins).
			
 
				+
			
 
				+  /// This function computes a warp function F(freq), defined between low_freq
			
 
				+  /// and high_freq inclusive, with the following properties:
			
 
				+  ///  F(low_freq) == low_freq
			
 
				+  ///  F(high_freq) == high_freq
			
 
				+  /// The function is continuous and piecewise linear with two inflection
			
 
				+  ///   points.
			
 
				+  /// The lower inflection point (measured in terms of the unwarped
			
 
				+  ///  frequency) is at frequency l, determined as described below.
			
 
				+  /// The higher inflection point is at a frequency h, determined as
			
 
				+  ///   described below.
			
 
				+  /// If l <= f <= h, then F(f) = f/vtln_warp_factor.
			
 
				+  /// If the higher inflection point (measured in terms of the unwarped
			
 
				+  ///   frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
			
 
				+  ///   Since (by the last point) F(h) == h/vtln_warp_factor, then
			
 
				+  ///   max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
			
 
				+  ///   h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
			
 
				+  ///     = vtln_high_cutoff * min(1, vtln_warp_factor).
			
 
				+  /// If the lower inflection point (measured in terms of the unwarped
			
 
				+  ///   frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
			
 
				+  ///   This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
			
 
				+  ///                       = vtln_low_cutoff * max(1, vtln_warp_factor)
			
 
				+
			
 
				+  if (freq < low_freq || freq > high_freq)
			
 
				+    return freq;  // in case this gets called
			
 
				+  // for out-of-range frequencies, just return the freq.
			
 
				+
			
 
				+  KNF_CHECK_GT(vtln_low_cutoff, low_freq);
			
 
				+  KNF_CHECK_LT(vtln_high_cutoff, high_freq);
			
 
				+
			
 
				+  float one = 1.0f;
			
 
				+  float l = vtln_low_cutoff * std::max(one, vtln_warp_factor);
			
 
				+  float h = vtln_high_cutoff * std::min(one, vtln_warp_factor);
			
 
				+  float scale = 1.0f / vtln_warp_factor;
			
 
				+  float Fl = scale * l;  // F(l);
			
 
				+  float Fh = scale * h;  // F(h);
			
 
				+  KNF_CHECK(l > low_freq && h < high_freq);
			
 
				+  // slope of left part of the 3-piece linear function
			
 
				+  float scale_left = (Fl - low_freq) / (l - low_freq);
			
 
				+  // [slope of center part is just "scale"]
			
 
				+
			
 
				+  // slope of right part of the 3-piece linear function
			
 
				+  float scale_right = (high_freq - Fh) / (high_freq - h);
			
 
				+
			
 
				+  if (freq < l) {
			
 
				+    return low_freq + scale_left * (freq - low_freq);
			
 
				+  } else if (freq < h) {
			
 
				+    return scale * freq;
			
 
				+  } else {  // freq >= h
			
 
				+    return high_freq + scale_right * (freq - high_freq);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+float MelBanks::VtlnWarpMelFreq(
			
 
				+    float vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
			
 
				+    float vtln_high_cutoff,
			
 
				+    float low_freq,  // upper+lower frequency cutoffs in mel computation
			
 
				+    float high_freq, float vtln_warp_factor, float mel_freq) {
			
 
				+  return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, low_freq,
			
 
				+                               high_freq, vtln_warp_factor,
			
 
				+                               InverseMelScale(mel_freq)));
			
 
				+}
			
 
				+
			
 
				+MelBanks::MelBanks(const MelBanksOptions &opts,
			
 
				+                   const FrameExtractionOptions &frame_opts,
			
 
				+                   float vtln_warp_factor)
			
 
				+    : htk_mode_(opts.htk_mode) {
			
 
				+  int32_t num_bins = opts.num_bins;
			
 
				+  if (num_bins < 3) KNF_LOG(FATAL) << "Must have at least 3 mel bins";
			
 
				+
			
 
				+  float sample_freq = frame_opts.samp_freq;
			
 
				+  int32_t window_length_padded = frame_opts.PaddedWindowSize();
			
 
				+  KNF_CHECK_EQ(window_length_padded % 2, 0);
			
 
				+
			
 
				+  int32_t num_fft_bins = window_length_padded / 2;
			
 
				+  float nyquist = 0.5f * sample_freq;
			
 
				+
			
 
				+  float low_freq = opts.low_freq, high_freq;
			
 
				+  if (opts.high_freq > 0.0f)
			
 
				+    high_freq = opts.high_freq;
			
 
				+  else
			
 
				+    high_freq = nyquist + opts.high_freq;
			
 
				+
			
 
				+  if (low_freq < 0.0f || low_freq >= nyquist || high_freq <= 0.0f ||
			
 
				+      high_freq > nyquist || high_freq <= low_freq) {
			
 
				+    KNF_LOG(FATAL) << "Bad values in options: low-freq " << low_freq
			
 
				+                   << " and high-freq " << high_freq << " vs. nyquist "
			
 
				+                   << nyquist;
			
 
				+  }
			
 
				+
			
 
				+  float fft_bin_width = sample_freq / window_length_padded;
			
 
				+  // fft-bin width [think of it as Nyquist-freq / half-window-length]
			
 
				+
			
 
				+  float mel_low_freq = MelScale(low_freq);
			
 
				+  float mel_high_freq = MelScale(high_freq);
			
 
				+
			
 
				+  debug_ = opts.debug_mel;
			
 
				+
			
 
				+  // divide by num_bins+1 in next line because of end-effects where the bins
			
 
				+  // spread out to the sides.
			
 
				+  float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1);
			
 
				+
			
 
				+  float vtln_low = opts.vtln_low, vtln_high = opts.vtln_high;
			
 
				+  if (vtln_high < 0.0f) {
			
 
				+    vtln_high += nyquist;
			
 
				+  }
			
 
				+
			
 
				+  if (vtln_warp_factor != 1.0f &&
			
 
				+      (vtln_low < 0.0f || vtln_low <= low_freq || vtln_low >= high_freq ||
			
 
				+       vtln_high <= 0.0f || vtln_high >= high_freq || vtln_high <= vtln_low)) {
			
 
				+    KNF_LOG(FATAL) << "Bad values in options: vtln-low " << vtln_low
			
 
				+                   << " and vtln-high " << vtln_high << ", versus "
			
 
				+                   << "low-freq " << low_freq << " and high-freq " << high_freq;
			
 
				+  }
			
 
				+
			
 
				+  bins_.resize(num_bins);
			
 
				+  center_freqs_.resize(num_bins);
			
 
				+
			
 
				+  for (int32_t bin = 0; bin < num_bins; ++bin) {
			
 
				+    float left_mel = mel_low_freq + bin * mel_freq_delta,
			
 
				+          center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
			
 
				+          right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;
			
 
				+
			
 
				+    if (vtln_warp_factor != 1.0f) {
			
 
				+      left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
			
 
				+                                 vtln_warp_factor, left_mel);
			
 
				+      center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
			
 
				+                                   vtln_warp_factor, center_mel);
			
 
				+      right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
			
 
				+                                  vtln_warp_factor, right_mel);
			
 
				+    }
			
 
				+    center_freqs_[bin] = InverseMelScale(center_mel);
			
 
				+
			
 
				+    // this_bin will be a vector of coefficients that is only
			
 
				+    // nonzero where this mel bin is active.
			
 
				+    std::vector<float> this_bin(num_fft_bins);
			
 
				+
			
 
				+    int32_t first_index = -1, last_index = -1;
			
 
				+    for (int32_t i = 0; i < num_fft_bins; ++i) {
			
 
				+      float freq = (fft_bin_width * i);  // Center frequency of this fft
			
 
				+                                         // bin.
			
 
				+      float mel = MelScale(freq);
			
 
				+      if (mel > left_mel && mel < right_mel) {
			
 
				+        float weight;
			
 
				+        if (mel <= center_mel)
			
 
				+          weight = (mel - left_mel) / (center_mel - left_mel);
			
 
				+        else
			
 
				+          weight = (right_mel - mel) / (right_mel - center_mel);
			
 
				+        this_bin[i] = weight;
			
 
				+        if (first_index == -1) first_index = i;
			
 
				+        last_index = i;
			
 
				+      }
			
 
				+    }
			
 
				+    KNF_CHECK(first_index != -1 && last_index >= first_index &&
			
 
				+              "You may have set num_mel_bins too large.");
			
 
				+
			
 
				+    bins_[bin].first = first_index;
			
 
				+    int32_t size = last_index + 1 - first_index;
			
 
				+    bins_[bin].second.insert(bins_[bin].second.end(),
			
 
				+                             this_bin.begin() + first_index,
			
 
				+                             this_bin.begin() + first_index + size);
			
 
				+
			
 
				+    // Replicate a bug in HTK, for testing purposes.
			
 
				+    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0f) {
			
 
				+      bins_[bin].second[0] = 0.0;
			
 
				+    }
			
 
				+  }  // for (int32_t bin = 0; bin < num_bins; ++bin) {
			
 
				+
			
 
				+  if (debug_) {
			
 
				+    std::ostringstream os;
			
 
				+    for (size_t i = 0; i < bins_.size(); i++) {
			
 
				+      os << "bin " << i << ", offset = " << bins_[i].first << ", vec = ";
			
 
				+      for (auto k : bins_[i].second) os << k << ", ";
			
 
				+      os << "\n";
			
 
				+    }
			
 
				+    KNF_LOG(INFO) << os.str();
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// "power_spectrum" contains fft energies.
			
 
				+void MelBanks::Compute(const float *power_spectrum,
			
 
				+                       float *mel_energies_out) const {
			
 
				+  int32_t num_bins = bins_.size();
			
 
				+
			
 
				+  for (int32_t i = 0; i < num_bins; i++) {
			
 
				+    int32_t offset = bins_[i].first;
			
 
				+    const auto &v = bins_[i].second;
			
 
				+    float energy = 0;
			
 
				+    for (int32_t k = 0; k != v.size(); ++k) {
			
 
				+      energy += v[k] * power_spectrum[k + offset];
			
 
				+    }
			
 
				+
			
 
				+    // HTK-like flooring- for testing purposes (we prefer dither)
			
 
				+    if (htk_mode_ && energy < 1.0) {
			
 
				+      energy = 1.0;
			
 
				+    }
			
 
				+
			
 
				+    mel_energies_out[i] = energy;
			
 
				+
			
 
				+    // The following assert was added due to a problem with OpenBlas that
			
 
				+    // we had at one point (it was a bug in that library).  Just to detect
			
 
				+    // it early.
			
 
				+    KNF_CHECK_EQ(energy, energy);  // check that energy is not nan
			
 
				+  }
			
 
				+
			
 
				+  if (debug_) {
			
 
				+    fprintf(stderr, "MEL BANKS:\n");
			
 
				+    for (int32_t i = 0; i < num_bins; i++)
			
 
				+      fprintf(stderr, " %f", mel_energies_out[i]);
			
 
				+    fprintf(stderr, "\n");
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/csrc/mel-computations.h
+++ b/ggml/examples/kaldi-native-fbank/csrc/mel-computations.h
@@ -0,0 +1,117 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+// This file is copied/modified from kaldi/src/feat/mel-computations.h
			
 
				+#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
			
 
				+#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
			
 
				+
			
 
				+#include <cmath>
			
 
				+#include <string>
			
 
				+#include <utility>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "feature-window.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+struct MelBanksOptions {
			
 
				+  int32_t num_bins = 25;  // e.g. 25; number of triangular bins
			
 
				+  float low_freq = 20;    // e.g. 20; lower frequency cutoff
			
 
				+
			
 
				+  // an upper frequency cutoff; 0 -> no cutoff, negative
			
 
				+  // ->added to the Nyquist frequency to get the cutoff.
			
 
				+  float high_freq = 0;
			
 
				+
			
 
				+  float vtln_low = 100;  // vtln lower cutoff of warping function.
			
 
				+
			
 
				+  // vtln upper cutoff of warping function: if negative, added
			
 
				+  // to the Nyquist frequency to get the cutoff.
			
 
				+  float vtln_high = -500;
			
 
				+
			
 
				+  bool debug_mel = false;
			
 
				+  // htk_mode is a "hidden" config, it does not show up on command line.
			
 
				+  // Enables more exact compatibility with HTK, for testing purposes.  Affects
			
 
				+  // mel-energy flooring and reproduces a bug in HTK.
			
 
				+  bool htk_mode = false;
			
 
				+
			
 
				+  std::string ToString() const {
			
 
				+    std::ostringstream os;
			
 
				+    os << "num_bins: " << num_bins << "\n";
			
 
				+    os << "low_freq: " << low_freq << "\n";
			
 
				+    os << "high_freq: " << high_freq << "\n";
			
 
				+    os << "vtln_low: " << vtln_low << "\n";
			
 
				+    os << "vtln_high: " << vtln_high << "\n";
			
 
				+    os << "debug_mel: " << debug_mel << "\n";
			
 
				+    os << "htk_mode: " << htk_mode << "\n";
			
 
				+    return os.str();
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts);
			
 
				+
			
 
				+class MelBanks {
			
 
				+ public:
			
 
				+  static inline float InverseMelScale(float mel_freq) {
			
 
				+    return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f);
			
 
				+  }
			
 
				+
			
 
				+  static inline float MelScale(float freq) {
			
 
				+    return 1127.0f * logf(1.0f + freq / 700.0f);
			
 
				+  }
			
 
				+
			
 
				+  static float VtlnWarpFreq(
			
 
				+      float vtln_low_cutoff,
			
 
				+      float vtln_high_cutoff,  // discontinuities in warp func
			
 
				+      float low_freq,
			
 
				+      float high_freq,  // upper+lower frequency cutoffs in
			
 
				+      // the mel computation
			
 
				+      float vtln_warp_factor, float freq);
			
 
				+
			
 
				+  static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff,
			
 
				+                               float low_freq, float high_freq,
			
 
				+                               float vtln_warp_factor, float mel_freq);
			
 
				+
			
 
				+  // TODO(fangjun): Remove vtln_warp_factor
			
 
				+  MelBanks(const MelBanksOptions &opts,
			
 
				+           const FrameExtractionOptions &frame_opts, float vtln_warp_factor);
			
 
				+
			
 
				+  /// Compute Mel energies (note: not log energies).
			
 
				+  /// At input, "fft_energies" contains the FFT energies (not log).
			
 
				+  ///
			
 
				+  /// @param fft_energies 1-D array of size num_fft_bins/2+1
			
 
				+  /// @param mel_energies_out  1-D array of size num_mel_bins
			
 
				+  void Compute(const float *fft_energies, float *mel_energies_out) const;
			
 
				+
			
 
				+  int32_t NumBins() const { return bins_.size(); }
			
 
				+
			
 
				+ private:
			
 
				+  // center frequencies of bins, numbered from 0 ... num_bins-1.
			
 
				+  // Needed by GetCenterFreqs().
			
 
				+  std::vector<float> center_freqs_;
			
 
				+
			
 
				+  // the "bins_" vector is a vector, one for each bin, of a pair:
			
 
				+  // (the first nonzero fft-bin), (the vector of weights).
			
 
				+  std::vector<std::pair<int32_t, std::vector<float>>> bins_;
			
 
				+
			
 
				+  // TODO(fangjun): Remove debug_ and htk_mode_
			
 
				+  bool debug_;
			
 
				+  bool htk_mode_;
			
 
				+};
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
			
--- a/ggml/examples/kaldi-native-fbank/csrc/online-feature.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/online-feature.cc
@@ -0,0 +1,166 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+// The content in this file is copied/modified from
			
 
				+// kaldi/src/feat/online-feature.cc
			
 
				+
			
 
				+#include "online-feature.h"
			
 
				+
			
 
				+#include <algorithm>
			
 
				+#include <utility>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "feature-window.h"
			
 
				+#include "log.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Builds an empty container. A request to hold 0 items is stored as -1,
			
 
				+// the same value the header uses as its default argument.
			
 
				+RecyclingVector::RecyclingVector(int32_t items_to_hold)
			
 
				+    : items_to_hold_(items_to_hold != 0 ? items_to_hold : -1),
			
 
				+      first_available_index_(0) {}
			
 
				+
			
 
				+// Returns a pointer to the frame stored under the global index, i.e. the
			
 
				+// index the frame would have if nothing had ever been dropped. Indices
			
 
				+// below first_available_index_ are reported via KNF_LOG(FATAL).
			
 
				+const float *RecyclingVector::At(int32_t index) const {
			
 
				+  const bool already_removed = index < first_available_index_;
			
 
				+  if (already_removed) {
			
 
				+    KNF_LOG(FATAL) << "Attempted to retrieve feature vector that was "
			
 
				+                      "already removed by the RecyclingVector (index = "
			
 
				+                   << index << "; "
			
 
				+                   << "first_available_index = " << first_available_index_
			
 
				+                   << "; "
			
 
				+                   << "size = " << Size() << ")";
			
 
				+  }
			
 
				+
			
 
				+  // Translate the global index into a position inside the deque;
			
 
				+  // std::deque::at() range-checks that position.
			
 
				+  const auto &item = items_.at(index - first_available_index_);
			
 
				+  return item.data();
			
 
				+}
			
 
				+
			
 
				+// Appends a frame. When the container already holds items_to_hold_ frames,
			
 
				+// the oldest one is dropped first and first_available_index_ advances.
			
 
				+void RecyclingVector::PushBack(std::vector<float> item) {
			
 
				+  // A negative items_to_hold_ (e.g. -1) turns into a very large unsigned
			
 
				+  // value here, so in practice the comparison below does not match.
			
 
				+  const size_t capacity = static_cast<size_t>(items_to_hold_);
			
 
				+  const bool is_full = items_.size() == capacity;
			
 
				+  if (is_full) {
			
 
				+    items_.pop_front();
			
 
				+    ++first_available_index_;
			
 
				+  }
			
 
				+  items_.push_back(std::move(item));
			
 
				+}
			
 
				+
			
 
				+// Number of frames still held plus the number already dropped from the
			
 
				+// front.
			
 
				+int32_t RecyclingVector::Size() const {
			
 
				+  const int32_t num_held = static_cast<int32_t>(items_.size());
			
 
				+  return first_available_index_ + num_held;
			
 
				+}
			
 
				+
			
 
				+// Discards up to n frames from the front; stops early once no frame is
			
 
				+// left.
			
 
				+void RecyclingVector::Pop(int32_t n) {
			
 
				+  int32_t remaining = n;
			
 
				+  while (remaining > 0 && !items_.empty()) {
			
 
				+    items_.pop_front();
			
 
				+    ++first_available_index_;
			
 
				+    --remaining;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Constructs the feature computer from opts and builds the window function
			
 
				+// from the computer's frame options. Starts with input_finished_ cleared
			
 
				+// and a waveform offset of 0.
			
 
				+template <class C>
			
 
				+OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
			
 
				+    const typename C::Options &opts)
			
 
				+    : computer_(opts),
			
 
				+      window_function_(computer_.GetFrameOptions()),
			
 
				+      input_finished_(false),
			
 
				+      waveform_offset_(0) {}
			
 
				+
			
 
				+// Appends n samples to the buffered waveform and computes every frame that
			
 
				+// has become available. Calling this after InputFinished() is reported
			
 
				+// via KNF_LOG(FATAL).
			
 
				+//
			
 
				+// @param sampling_rate Checked against samp_freq of the frame options.
			
 
				+// @param waveform Pointer to a 1-D array of size n
			
 
				+// @param n Number of entries in waveform; n <= 0 is a no-op.
			
 
				+template <class C>
			
 
				+void OnlineGenericBaseFeature<C>::AcceptWaveform(float sampling_rate,
			
 
				+                                                 const float *waveform,
			
 
				+                                                 int32_t n) {
			
 
				+  // A non-positive count carries no samples. Returning early also keeps
			
 
				+  // [waveform, waveform + n) from being an invalid range for insert().
			
 
				+  if (n <= 0) {
			
 
				+    return;  // Nothing to do.
			
 
				+  }
			
 
				+
			
 
				+  if (input_finished_) {
			
 
				+    KNF_LOG(FATAL) << "AcceptWaveform called after InputFinished() was called.";
			
 
				+  }
			
 
				+
			
 
				+  KNF_CHECK_EQ(sampling_rate, computer_.GetFrameOptions().samp_freq);
			
 
				+
			
 
				+  waveform_remainder_.insert(waveform_remainder_.end(), waveform, waveform + n);
			
 
				+
			
 
				+  ComputeFeatures();
			
 
				+}
			
 
				+
			
 
				+// Marks the input as complete and runs ComputeFeatures() once more with
			
 
				+// input_finished_ set.
			
 
				+template <class C>
			
 
				+void OnlineGenericBaseFeature<C>::InputFinished() {
			
 
				+  input_finished_ = true;
			
 
				+  ComputeFeatures();
			
 
				+}
			
 
				+
			
 
				+// Computes all frames that can be extracted from the samples seen so far,
			
 
				+// appends them to features_, and then drops the buffered samples that lie
			
 
				+// before the first sample of the next frame.
			
 
				+template <class C>
			
 
				+void OnlineGenericBaseFeature<C>::ComputeFeatures() {
			
 
				+  const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions();
			
 
				+
			
 
				+  const int64_t num_samples_total =
			
 
				+      waveform_offset_ + waveform_remainder_.size();
			
 
				+  const int32_t num_frames_old = features_.Size();
			
 
				+  const int32_t num_frames_new =
			
 
				+      NumFrames(num_samples_total, frame_opts, input_finished_);
			
 
				+
			
 
				+  KNF_CHECK_GE(num_frames_new, num_frames_old);
			
 
				+
			
 
				+  // note: this online feature-extraction code does not support VTLN.
			
 
				+  float vtln_warp = 1.0;
			
 
				+  const bool need_raw_log_energy = computer_.NeedRawLogEnergy();
			
 
				+
			
 
				+  // The window buffer is reused across frames and zeroed before each one.
			
 
				+  std::vector<float> window;
			
 
				+  for (int32_t frame = num_frames_old; frame < num_frames_new; ++frame) {
			
 
				+    std::fill(window.begin(), window.end(), 0);
			
 
				+
			
 
				+    float raw_log_energy = 0.0;
			
 
				+    float *energy_out = need_raw_log_energy ? &raw_log_energy : nullptr;
			
 
				+    ExtractWindow(waveform_offset_, waveform_remainder_.data(),
			
 
				+                  waveform_remainder_.size(), frame, frame_opts,
			
 
				+                  window_function_, &window, energy_out);
			
 
				+
			
 
				+    std::vector<float> this_feature(computer_.Dim());
			
 
				+    computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature.data());
			
 
				+    features_.PushBack(std::move(this_feature));
			
 
				+  }
			
 
				+
			
 
				+  // Everything before the first sample of the next frame is not needed to
			
 
				+  // compute frames in the future.
			
 
				+  const int64_t first_sample_of_next_frame =
			
 
				+      FirstSampleOfFrame(num_frames_new, frame_opts);
			
 
				+  const int32_t samples_to_discard =
			
 
				+      first_sample_of_next_frame - waveform_offset_;
			
 
				+  if (samples_to_discard <= 0) {
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  const int32_t num_kept =
			
 
				+      static_cast<int32_t>(waveform_remainder_.size()) - samples_to_discard;
			
 
				+  if (num_kept <= 0) {
			
 
				+    // odd, but we'll try to handle it.
			
 
				+    waveform_offset_ += waveform_remainder_.size();
			
 
				+    waveform_remainder_.resize(0);
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  // Keep only the tail of the buffer and account for the dropped prefix.
			
 
				+  std::vector<float> kept(waveform_remainder_.begin() + samples_to_discard,
			
 
				+                          waveform_remainder_.end());
			
 
				+  waveform_offset_ += samples_to_discard;
			
 
				+  waveform_remainder_.swap(kept);
			
 
				+}
			
 
				+
			
 
				+// Explicit instantiation: the member functions of the template are defined
			
 
				+// in this file, so the FbankComputer variant is instantiated here.
			
 
				+template class OnlineGenericBaseFeature<FbankComputer>;
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/csrc/online-feature.h
+++ b/ggml/examples/kaldi-native-fbank/csrc/online-feature.h
@@ -0,0 +1,148 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+// The content in this file is copied/modified from
			
 
				+// kaldi/src/feat/online-feature.h
			
 
				+#ifndef KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_
			
 
				+#define KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_
			
 
				+
			
 
				+#include <cstdint>
			
 
				+#include <deque>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "feature-fbank.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+/// This class serves as a storage for feature vectors with an option to limit
			
 
				+/// the memory usage by removing old elements. The deleted frames indices are
			
 
				+/// "remembered" so that regardless of the MAX_ITEMS setting, the user always
			
 
				+/// provides the indices as if no deletion was being performed.
			
 
				+/// This is useful when processing very long recordings which would otherwise
			
 
				+/// cause the memory to eventually blow up when the features are not being
			
 
				+/// removed.
			
 
				+class RecyclingVector {
			
 
				+ public:
			
 
				+  /// By default it does not remove any elements.
			
 
				+  /// @param items_to_hold Number of frames kept before the oldest one is
			
 
				+  ///                      dropped on PushBack(); 0 is stored as -1.
			
 
				+  explicit RecyclingVector(int32_t items_to_hold = -1);
			
 
				+
			
 
				+  ~RecyclingVector() = default;
			
 
				+  RecyclingVector(const RecyclingVector &) = delete;
			
 
				+  RecyclingVector &operator=(const RecyclingVector &) = delete;
			
 
				+
			
 
				+  // Returns the frame stored under `index`, where `index` counts frames as
			
 
				+  // if none had been removed. Asking for a frame that was already removed
			
 
				+  // is reported via KNF_LOG(FATAL).
			
 
				+  //
			
 
				+  // The pointer is owned by RecyclingVector
			
 
				+  // Users should not free it
			
 
				+  const float *At(int32_t index) const;
			
 
				+
			
 
				+  // Appends a frame; when items_to_hold frames are already stored, the
			
 
				+  // oldest one is removed first.
			
 
				+  void PushBack(std::vector<float> item);
			
 
				+
			
 
				+  /// This method returns the size as if no "recycling" had happened,
			
 
				+  /// i.e. equivalent to the number of times the PushBack method has been
			
 
				+  /// called.
			
 
				+  int32_t Size() const;
			
 
				+
			
 
				+  // discard the first n frames (fewer if less than n frames are stored)
			
 
				+  void Pop(int32_t n);
			
 
				+
			
 
				+ private:
			
 
				+  // Frames that are currently stored, oldest first.
			
 
				+  std::deque<std::vector<float>> items_;
			
 
				+  // Maximum number of stored frames; a negative value disables the limit.
			
 
				+  int32_t items_to_hold_;
			
 
				+  // Number of frames removed so far, i.e. the index of items_.front().
			
 
				+  int32_t first_available_index_;
			
 
				+};
			
 
				+
			
 
				+/// This is a templated class for online feature extraction;
			
 
				+/// it's templated on a class like MfccComputer or PlpComputer
			
 
				+/// that does the basic feature extraction.
			
 
				+template <class C>
			
 
				+class OnlineGenericBaseFeature {
			
 
				+ public:
			
 
				+  // Constructor from options class
			
 
				+  explicit OnlineGenericBaseFeature(const typename C::Options &opts);
			
 
				+
			
 
				+  // Dimension of one feature frame, as reported by the computer.
			
 
				+  int32_t Dim() const { return computer_.Dim(); }
			
 
				+
			
 
				+  // frame_shift_ms of the frame options, converted to seconds.
			
 
				+  float FrameShiftInSeconds() const {
			
 
				+    return computer_.GetFrameOptions().frame_shift_ms / 1000.0f;
			
 
				+  }
			
 
				+
			
 
				+  // Number of frames computed so far; frames already discarded through
			
 
				+  // Pop() are still counted.
			
 
				+  int32_t NumFramesReady() const { return features_.Size(); }
			
 
				+
			
 
				+  // Note: IsLastFrame() will only ever return true if you have called
			
 
				+  // InputFinished() (and this frame is the last frame).
			
 
				+  bool IsLastFrame(int32_t frame) const {
			
 
				+    return input_finished_ && frame == NumFramesReady() - 1;
			
 
				+  }
			
 
				+
			
 
				+  // Returns a pointer to the Dim() values of the given frame. The pointer
			
 
				+  // is owned by this object. See RecyclingVector::At() for the handling of
			
 
				+  // frames that were already discarded.
			
 
				+  const float *GetFrame(int32_t frame) const { return features_.At(frame); }
			
 
				+
			
 
				+  // This would be called from the application, when you get
			
 
				+  // more wave data.  Note: the sampling_rate is only provided so
			
 
				+  // the code can assert that it matches the sampling rate
			
 
				+  // expected in the options.
			
 
				+  //
			
 
				+  // @param sampling_rate The sampling_rate of the input waveform
			
 
				+  // @param waveform Pointer to a 1-D array of size n
			
 
				+  // @param n Number of entries in waveform
			
 
				+  void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n);
			
 
				+
			
 
				+  // InputFinished() tells the class you won't be providing any
			
 
				+  // more waveform.  This will help flush out the last frame or two
			
 
				+  // of features, in the case where snip-edges == false; it also
			
 
				+  // affects the return value of IsLastFrame().
			
 
				+  void InputFinished();
			
 
				+
			
 
				+  // discard the first n frames that are still stored; NumFramesReady()
			
 
				+  // is not reduced by this.
			
 
				+  void Pop(int32_t n) { features_.Pop(n); }
			
 
				+
			
 
				+ private:
			
 
				+  // This function computes any additional feature frames that it is possible to
			
 
				+  // compute from 'waveform_remainder_', which at this point may contain more
			
 
				+  // than just a remainder-sized quantity (because AcceptWaveform() appends to
			
 
				+  // waveform_remainder_ before calling this function).  It adds these feature
			
 
				+  // frames to features_, and shifts off any now-unneeded samples of input from
			
 
				+  // waveform_remainder_ while incrementing waveform_offset_ by the same amount.
			
 
				+  void ComputeFeatures();
			
 
				+
			
 
				+  C computer_;  // class that does the MFCC or PLP or filterbank computation
			
 
				+
			
 
				+  // Window function built from the frame options of computer_.
			
 
				+  FeatureWindowFunction window_function_;
			
 
				+
			
 
				+  // features_ is the Mfcc or Plp or Fbank features that we have already
			
 
				+  // computed.
			
 
				+
			
 
				+  RecyclingVector features_;
			
 
				+
			
 
				+  // True if the user has called "InputFinished()"
			
 
				+  bool input_finished_;
			
 
				+
			
 
				+  // waveform_offset_ is the number of samples of waveform that we have
			
 
				+  // already discarded, i.e. that were prior to 'waveform_remainder_'.
			
 
				+  int64_t waveform_offset_;
			
 
				+
			
 
				+  // waveform_remainder_ is a short piece of waveform that we may need to keep
			
 
				+  // after extracting all the whole frames we can (whatever length of feature
			
 
				+  // will be required for the next phase of computation).
			
 
				+  // It is a 1-D array of samples.
			
 
				+  std::vector<float> waveform_remainder_;
			
 
				+};
			
 
				+
			
 
				+// Online feature extractor that uses FbankComputer for the per-frame
			
 
				+// computation.
			
 
				+using OnlineFbank = OnlineGenericBaseFeature<FbankComputer>;
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_
			
--- a/ggml/examples/kaldi-native-fbank/csrc/rfft.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/rfft.cc
@@ -0,0 +1,67 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "rfft.h"
			
 
				+
			
 
				+#include <algorithm>
			
 
				+#include <cmath>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "log.h"
			
 
				+
			
 
				+// see fftsg.c
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);
			
 
				+#else
			
 
				+void rdft(int n, int isgn, double *a, int *ip, double *w);
			
 
				+#endif
			
 
				+
			
 
				+namespace knf {
			
 
				+// Implementation detail of Rfft: owns the work buffers handed to rdft().
			
 
				+class Rfft::RfftImpl {
			
 
				+ public:
			
 
				+  // @param n Number of fft bins. It must be a power of 2 and n >= 2.
			
 
				+  //
			
 
				+  // n_ is the first data member, so CheckedSize() validates n before the
			
 
				+  // work buffers ip_ and w_ are sized from it.
			
 
				+  explicit RfftImpl(int32_t n)
			
 
				+      : n_(CheckedSize(n)), ip_(2 + std::sqrt(n / 2)), w_(n / 2) {}
			
 
				+
			
 
				+  // Single-precision entry point: copies the n_ input values into a
			
 
				+  // temporary double buffer, transforms it, and copies the result back.
			
 
				+  void Compute(float *in_out) {
			
 
				+    std::vector<double> d(in_out, in_out + n_);
			
 
				+
			
 
				+    Compute(d.data());
			
 
				+
			
 
				+    std::copy(d.begin(), d.end(), in_out);
			
 
				+  }
			
 
				+
			
 
				+  // In-place transform of n_ doubles.
			
 
				+  void Compute(double *in_out) {
			
 
				+    // 1 means forward fft
			
 
				+    rdft(n_, 1, in_out, ip_.data(), w_.data());
			
 
				+  }
			
 
				+
			
 
				+ private:
			
 
				+  // Enforces the size contract documented for Rfft (power of 2, n >= 2)
			
 
				+  // and returns n unchanged.
			
 
				+  static int32_t CheckedSize(int32_t n) {
			
 
				+    KNF_CHECK_GE(n, 2);
			
 
				+    KNF_CHECK_EQ(n & (n - 1), 0);
			
 
				+    return n;
			
 
				+  }
			
 
				+
			
 
				+  int32_t n_;                // transform size
			
 
				+  std::vector<int32_t> ip_;  // integer work area passed to rdft()
			
 
				+  std::vector<double> w_;    // floating-point work area passed to rdft()
			
 
				+};
			
 
				+
			
 
				+// Allocates the implementation object for an n-point transform.
			
 
				+Rfft::Rfft(int32_t n) : impl_(std::make_unique<RfftImpl>(n)) {}
			
 
				+
			
 
				+// Defined in this file, where RfftImpl is a complete type, so that
			
 
				+// std::unique_ptr<RfftImpl> can destroy the object.
			
 
				+Rfft::~Rfft() = default;
			
 
				+
			
 
				+// Both overloads forward to the implementation object.
			
 
				+void Rfft::Compute(float *in_out) { impl_->Compute(in_out); }
			
 
				+void Rfft::Compute(double *in_out) { impl_->Compute(in_out); }
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/csrc/rfft.h
+++ b/ggml/examples/kaldi-native-fbank/csrc/rfft.h
@@ -0,0 +1,56 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_
			
 
				+#define KALDI_NATIVE_FBANK_CSRC_RFFT_H_
			
 
				+
			
 
				+#include <cstdint>
			
 
				+#include <memory>
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// n-point Real discrete Fourier transform
			
 
				+// where n is a power of 2. n >= 2
			
 
				+//
			
 
				+//  R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2
			
 
				+//  I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0<k<n/2
			
 
				+class Rfft {
			
 
				+ public:
			
 
				+  // @param n Number of fft bins. it should be a power of 2.
			
 
				+  explicit Rfft(int32_t n);
			
 
				+  ~Rfft();
			
 
				+
			
 
				+  /** Computes the transform in place.
			
 
				+   *
			
 
				+   * @param in_out A 1-D array of size n.
			
 
				+   *             On return:
			
 
				+   *               in_out[0] = R[0]
			
 
				+   *               in_out[1] = R[n/2]
			
 
				+   *               for 1 <= k < n/2,
			
 
				+   *                 in_out[2*k] = R[k]
			
 
				+   *                 in_out[2*k+1] = I[k]
			
 
				+   *
			
 
				+   */
			
 
				+  void Compute(float *in_out);
			
 
				+  void Compute(double *in_out);
			
 
				+
			
 
				+ private:
			
 
				+  // The implementation class is only declared here; it is held through a
			
 
				+  // pointer.
			
 
				+  class RfftImpl;
			
 
				+  std::unique_ptr<RfftImpl> impl_;
			
 
				+};
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_CSRC_RFFT_H_
			
--- a/ggml/examples/kaldi-native-fbank/csrc/test-log.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/test-log.cc
@@ -0,0 +1,73 @@
 
				+/**
			
 
				+ * Copyright      2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "gtest/gtest.h"
			
 
				+#include "log.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+#if KNF_ENABLE_CHECK
			
 
				+
			
 
				+// Exercises every KNF_LOG / KNF_DLOG severity. FATAL is expected to throw
			
 
				+// std::runtime_error; KNF_DLOG(FATAL) is only checked when NDEBUG is not
			
 
				+// defined.
			
 
				+TEST(Log, TestLog) {
			
 
				+  KNF_LOG(TRACE) << "this is a trace message";
			
 
				+  KNF_LOG(DEBUG) << "this is a debug message";
			
 
				+  KNF_LOG(INFO) << "this is an info message";
			
 
				+  KNF_LOG(WARNING) << "this is a warning message";
			
 
				+  KNF_LOG(ERROR) << "this is an error message";
			
 
				+
			
 
				+  ASSERT_THROW(KNF_LOG(FATAL) << "This will crash the program",
			
 
				+               std::runtime_error);
			
 
				+
			
 
				+  // For debug build
			
 
				+
			
 
				+  // Each message names the severity it is logged with.
			
 
				+  KNF_DLOG(TRACE) << "this is a trace message for debug build";
			
 
				+  KNF_DLOG(DEBUG) << "this is a debug message for debug build";
			
 
				+  KNF_DLOG(INFO) << "this is an info message for debug build";
			
 
				+  KNF_DLOG(ERROR) << "this is an error message for debug build";
			
 
				+  KNF_DLOG(WARNING) << "this is a warning message for debug build";
			
 
				+
			
 
				+#if !defined(NDEBUG)
			
 
				+  ASSERT_THROW(KNF_DLOG(FATAL) << "this is a fatal message for debug build",
			
 
				+               std::runtime_error);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+// Exercises the KNF_CHECK_* macros and their debug-only KNF_DCHECK_*
			
 
				+// counterparts: passing comparisons must not throw, a failing one is
			
 
				+// expected to throw std::runtime_error.
			
 
				+TEST(Log, TestCheck) {
			
 
				+  KNF_CHECK_EQ(1, 1) << "ok";
			
 
				+  KNF_CHECK_LE(1, 3) << "ok";
			
 
				+  KNF_CHECK_LT(1, 2) << "ok";
			
 
				+  KNF_CHECK_GT(2, 1) << "ok";
			
 
				+  KNF_CHECK_GE(2, 1) << "ok";
			
 
				+
			
 
				+  ASSERT_THROW(KNF_CHECK_EQ(2, 1) << "bad things happened", std::runtime_error);
			
 
				+
			
 
				+  // for debug build
			
 
				+  KNF_DCHECK_EQ(1, 1) << "ok";
			
 
				+  KNF_DCHECK_LE(1, 3) << "ok";
			
 
				+  KNF_DCHECK_LT(1, 2) << "ok";
			
 
				+  KNF_DCHECK_GT(2, 1) << "ok";
			
 
				+  KNF_DCHECK_GE(2, 1) << "ok";
			
 
				+
			
 
				+#if !defined(NDEBUG)
			
 
				+  // The failing case of the debug-only macro; the non-debug macro is
			
 
				+  // already covered above.
			
 
				+  ASSERT_THROW(KNF_DCHECK_EQ(2, 1) << "bad things happened", std::runtime_error);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/csrc/test-online-fbank.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/test-online-fbank.cc
@@ -0,0 +1,48 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include <iostream>
			
 
				+
			
 
				+#include "online-feature.h"
			
 
				+
			
 
				+// Feeds a synthetic 1600-sample signal to OnlineFbank one sample at a time
			
 
				+// and prints every ready frame as one comma-separated line.
			
 
				+int main() {
			
 
				+  knf::FbankOptions opts;
			
 
				+  opts.frame_opts.dither = 0;
			
 
				+  opts.mel_opts.num_bins = 10;
			
 
				+
			
 
				+  knf::OnlineFbank fbank(opts);
			
 
				+  for (int32_t i = 0; i < 1600; ++i) {
			
 
				+    float s = (i * i - i / 2) / 32767.;
			
 
				+    fbank.AcceptWaveform(16000, &s, 1);
			
 
				+  }
			
 
				+
			
 
				+  // Write straight to std::cout: <iostream> is the only stream header this
			
 
				+  // file includes, so no std::ostringstream buffer is used.
			
 
				+  int32_t n = fbank.NumFramesReady();
			
 
				+  for (int32_t i = 0; i != n; ++i) {
			
 
				+    const float *frame = fbank.GetFrame(i);
			
 
				+    for (int32_t k = 0; k != opts.mel_opts.num_bins; ++k) {
			
 
				+      std::cout << frame[k] << ", ";
			
 
				+    }
			
 
				+    std::cout << "\n";
			
 
				+  }
			
 
				+
			
 
				+  // Trailing empty line after the last frame.
			
 
				+  std::cout << "\n";
			
 
				+
			
 
				+  return 0;
			
 
				+}
			
--- a/ggml/examples/kaldi-native-fbank/csrc/test-online-feature.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/test-online-feature.cc
@@ -0,0 +1,59 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "gtest/gtest.h"
			
 
				+#include "online-feature.h"
			
 
				+namespace knf {
			
 
				+
			
 
				+// TEST(RecyclingVector, TestUnlimited) {
			
 
				+//   RecyclingVector v(-1);
			
 
				+//   constexpr int32_t N = 100;
			
 
				+//   for (int32_t i = 0; i != N; ++i) {
			
 
				+//     std::unique_ptr<float[]> p(new float[3]{i, i + 1, i + 2});
			
 
				+//     v.PushBack(std::move(p));
			
 
				+//   }
			
 
				+//   ASSERT_EQ(v.Size(), N);
			
 
				+
			
 
				+//   for (int32_t i = 0; i != N; ++i) {
			
 
				+//     const float *t = v.At(i);
			
 
				+//     for (int32_t k = 0; k != 3; ++k) {
			
 
				+//       EXPECT_EQ(t[k], (i + k));
			
 
				+//     }
			
 
				+//   }
			
 
				+// }
			
 
				+
			
 
				+// TEST(RecyclingVector, Testlimited) {
			
 
				+//   constexpr int32_t K = 3;
			
 
				+//   constexpr int32_t N = 10;
			
 
				+//   RecyclingVector v(K);
			
 
				+//   for (int32_t i = 0; i != N; ++i) {
			
 
				+//     std::unique_ptr<float[]> p(new float[3]{i, i + 1, i + 2});
			
 
				+//     v.PushBack(std::move(p));
			
 
				+//   }
			
 
				+
			
 
				+//   ASSERT_EQ(v.Size(), N);
			
 
				+
			
 
				+//   for (int32_t i = N - K; i != N; ++i) {
			
 
				+//     const float *t = v.At(i);
			
 
				+
			
 
				+//     for (int32_t k = 0; k != 3; ++k) {
			
 
				+//       EXPECT_EQ(t[k], (i + k));
			
 
				+//     }
			
 
				+//   }
			
 
				+// }
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/csrc/test-rfft.cc
+++ b/ggml/examples/kaldi-native-fbank/csrc/test-rfft.cc
@@ -0,0 +1,52 @@
 
				+/**
			
 
				+ * Copyright      2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "gtest/gtest.h"
			
 
				+#include "rfft.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+#if 0
			
 
				+>>> import torch
			
 
				+>>> a = torch.tensor([1., -1, 3, 8, 20, 6, 0, 2])
			
 
				+>>> torch.fft.rfft(a)
			
 
				+tensor([ 39.0000+0.0000j, -28.1924-2.2929j,  18.0000+5.0000j,  -9.8076+3.7071j,
			
 
				+          9.0000+0.0000j])
			
 
				+#endif
			
 
				+
			
 
				+// Checks an 8-point transform against the torch.fft.rfft values quoted in
			
 
				+// the #if 0 block above. The output is packed as
			
 
				+// [R0, R4, R1, I1, R2, I2, R3, I3]; the imaginary parts are compared with
			
 
				+// their sign flipped relative to the torch result.
			
 
				+TEST(Rfft, TestRfft) {
			
 
				+  knf::Rfft fft(8);
			
 
				+  // The same Rfft object is reused for every iteration.
			
 
				+  for (int32_t i = 0; i != 10; ++i) {
			
 
				+    std::vector<float> d = {1, -1, 3, 8, 20, 6, 0, 2};
			
 
				+    fft.Compute(d.data());
			
 
				+
			
 
				+    // R[0] and R[n/2]
			
 
				+    EXPECT_EQ(d[0], 39);
			
 
				+    EXPECT_EQ(d[1], 9);
			
 
				+
			
 
				+    // k = 1
			
 
				+    EXPECT_NEAR(d[2], -28.1924, 1e-3);
			
 
				+    EXPECT_NEAR(-d[3], -2.2929, 1e-3);
			
 
				+
			
 
				+    // k = 2
			
 
				+    EXPECT_NEAR(d[4], 18, 1e-3);
			
 
				+    EXPECT_NEAR(-d[5], 5, 1e-3);
			
 
				+
			
 
				+    // k = 3
			
 
				+    EXPECT_NEAR(d[6], -9.8076, 1e-3);
			
 
				+    EXPECT_NEAR(-d[7], 3.7071, 1e-3);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/python/CMakeLists.txt
+++ b/ggml/examples/kaldi-native-fbank/python/CMakeLists.txt
@@ -0,0 +1,2 @@
 
				+# csrc defines the _kaldi_native_fbank pybind11 module; tests is added
			
 
				+# after it.
			
 
				+add_subdirectory(csrc)
			
 
				+add_subdirectory(tests)
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/CMakeLists.txt
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/CMakeLists.txt
@@ -0,0 +1,28 @@
 
				+# Python extension module built from the binding sources in this directory.
			
 
				+pybind11_add_module(_kaldi_native_fbank
			
 
				+  feature-fbank.cc
			
 
				+  feature-window.cc
			
 
				+  kaldi-native-fbank.cc
			
 
				+  mel-computations.cc
			
 
				+  online-feature.cc
			
 
				+  utils.cc
			
 
				+)
			
 
				+
			
 
				+if(APPLE)
			
 
				+  # Ask the interpreter for its site-packages directory and add it as an
			
 
				+  # rpath entry. sysconfig is used because distutils was removed from the
			
 
				+  # standard library in Python 3.12.
			
 
				+  execute_process(
			
 
				+    COMMAND "${PYTHON_EXECUTABLE}" -c "import sysconfig; print(sysconfig.get_path('purelib'))"
			
 
				+    OUTPUT_STRIP_TRAILING_WHITESPACE
			
 
				+    OUTPUT_VARIABLE PYTHON_SITE_PACKAGE_DIR
			
 
				+  )
			
 
				+  message(STATUS "PYTHON_SITE_PACKAGE_DIR: ${PYTHON_SITE_PACKAGE_DIR}")
			
 
				+  target_link_libraries(_kaldi_native_fbank PRIVATE "-Wl,-rpath,${PYTHON_SITE_PACKAGE_DIR}")
			
 
				+endif()
			
 
				+
			
 
				+if(NOT WIN32)
			
 
				+  # Additional rpath entry relative to kaldi_native_fbank_rpath_origin.
			
 
				+  target_link_libraries(_kaldi_native_fbank PRIVATE "-Wl,-rpath,${kaldi_native_fbank_rpath_origin}/kaldi_native_fbank/lib")
			
 
				+endif()
			
 
				+
			
 
				+target_link_libraries(_kaldi_native_fbank PRIVATE kaldi-native-fbank-core)
			
 
				+
			
 
				+install(TARGETS _kaldi_native_fbank
			
 
				+  DESTINATION ../
			
 
				+)
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/feature-fbank.cc
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/feature-fbank.cc
@@ -0,0 +1,57 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/feature-fbank.h"
			
 
				+
			
 
				+#include <memory>
			
 
				+#include <string>
			
 
				+
			
 
				+#include "feature-fbank.h"
			
 
				+#include "kaldi-native-fbank/python/csrc/utils.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Registers the Python class `FbankOptions`, a binding of knf::FbankOptions.
			
 
				+//
			
 
				+// Every option field is exposed as a read/write attribute. The class also
			
 
				+// provides `__str__` (via ToString()), `as_dict()`, the static `from_dict()`
			
 
				+// and pickle support that goes through the same dict representation.
			
 
				+static void PybindFbankOptions(py::module &m) {  // NOLINT
			
 
				+  using PyClass = FbankOptions;
			
 
				+
			
 
				+  py::class_<PyClass> cls(m, "FbankOptions");
			
 
				+  cls.def(py::init<>());
			
 
				+
			
 
				+  // Plain data members.
			
 
				+  cls.def_readwrite("frame_opts", &PyClass::frame_opts);
			
 
				+  cls.def_readwrite("mel_opts", &PyClass::mel_opts);
			
 
				+  cls.def_readwrite("use_energy", &PyClass::use_energy);
			
 
				+  cls.def_readwrite("energy_floor", &PyClass::energy_floor);
			
 
				+  cls.def_readwrite("raw_energy", &PyClass::raw_energy);
			
 
				+  cls.def_readwrite("htk_compat", &PyClass::htk_compat);
			
 
				+  cls.def_readwrite("use_log_fbank", &PyClass::use_log_fbank);
			
 
				+  cls.def_readwrite("use_power", &PyClass::use_power);
			
 
				+
			
 
				+  // String and dict conversions.
			
 
				+  cls.def("__str__", [](const PyClass &self) -> std::string {
			
 
				+    return self.ToString();
			
 
				+  });
			
 
				+  cls.def("as_dict", [](const PyClass &self) -> py::dict {
			
 
				+    return AsDict(self);
			
 
				+  });
			
 
				+  cls.def_static("from_dict", [](py::dict dict) -> PyClass {
			
 
				+    return FbankOptionsFromDict(dict);
			
 
				+  });
			
 
				+
			
 
				+  // Pickling: the state is the dict produced by AsDict().
			
 
				+  cls.def(py::pickle(
			
 
				+      [](const PyClass &self) -> py::dict { return AsDict(self); },
			
 
				+      [](py::dict dict) -> PyClass { return FbankOptionsFromDict(dict); }));
			
 
				+}
			
 
				+
			
 
				+// Public entry point of this file: registers the `FbankOptions` binding on
			
 
				+// module `m` by delegating to PybindFbankOptions().
			
 
				+void PybindFeatureFbank(py::module &m) {  // NOLINT
			
 
				+  PybindFbankOptions(m);
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/feature-fbank.h
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/feature-fbank.h
@@ -0,0 +1,30 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_FBANK_H_
			
 
				+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_FBANK_H_
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Registers the `FbankOptions` Python class on module `m`.
			
 
				+void PybindFeatureFbank(py::module &m);  // NOLINT
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_FBANK_H_
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/feature-window.cc
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/feature-window.cc
@@ -0,0 +1,66 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/feature-window.h"
			
 
				+
			
 
				+#include <string>
			
 
				+
			
 
				+#include "feature-window.h"
			
 
				+#include "kaldi-native-fbank/python/csrc/utils.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Registers the Python class `FrameExtractionOptions`, a binding of
			
 
				+// knf::FrameExtractionOptions.
			
 
				+//
			
 
				+// Every option field is exposed as a read/write attribute. The class also
			
 
				+// provides `as_dict()`, the static `from_dict()`, `__str__` (via ToString())
			
 
				+// and pickle support that goes through the same dict representation.
			
 
				+static void PybindFrameExtractionOptions(py::module &m) {  // NOLINT
			
 
				+  using PyClass = FrameExtractionOptions;
			
 
				+
			
 
				+  py::class_<PyClass> cls(m, "FrameExtractionOptions");
			
 
				+  cls.def(py::init<>());
			
 
				+
			
 
				+  // Plain data members.
			
 
				+  cls.def_readwrite("samp_freq", &PyClass::samp_freq);
			
 
				+  cls.def_readwrite("frame_shift_ms", &PyClass::frame_shift_ms);
			
 
				+  cls.def_readwrite("frame_length_ms", &PyClass::frame_length_ms);
			
 
				+  cls.def_readwrite("dither", &PyClass::dither);
			
 
				+  cls.def_readwrite("preemph_coeff", &PyClass::preemph_coeff);
			
 
				+  cls.def_readwrite("remove_dc_offset", &PyClass::remove_dc_offset);
			
 
				+  cls.def_readwrite("window_type", &PyClass::window_type);
			
 
				+  cls.def_readwrite("round_to_power_of_two", &PyClass::round_to_power_of_two);
			
 
				+  cls.def_readwrite("blackman_coeff", &PyClass::blackman_coeff);
			
 
				+  cls.def_readwrite("snip_edges", &PyClass::snip_edges);
			
 
				+
			
 
				+  // Dict conversions.
			
 
				+  cls.def("as_dict", [](const PyClass &self) -> py::dict {
			
 
				+    return AsDict(self);
			
 
				+  });
			
 
				+  cls.def_static("from_dict", [](py::dict dict) -> PyClass {
			
 
				+    return FrameExtractionOptionsFromDict(dict);
			
 
				+  });
			
 
				+
			
 
				+  // These two attributes are compiled out and therefore not exposed.
			
 
				+#if 0
			
 
				+  cls.def_readwrite("allow_downsample", &PyClass::allow_downsample);
			
 
				+  cls.def_readwrite("allow_upsample", &PyClass::allow_upsample);
			
 
				+#endif
			
 
				+
			
 
				+  cls.def("__str__", [](const PyClass &self) -> std::string {
			
 
				+    return self.ToString();
			
 
				+  });
			
 
				+
			
 
				+  // Pickling: the state is the dict produced by AsDict().
			
 
				+  cls.def(py::pickle(
			
 
				+      [](const PyClass &self) -> py::dict { return AsDict(self); },
			
 
				+      [](py::dict dict) -> PyClass {
			
 
				+        return FrameExtractionOptionsFromDict(dict);
			
 
				+      }));
			
 
				+}
			
 
				+
			
 
				+// Public entry point of this file: registers the `FrameExtractionOptions`
			
 
				+// binding on module `m` by delegating to PybindFrameExtractionOptions().
			
 
				+void PybindFeatureWindow(py::module &m) {  // NOLINT
			
 
				+  PybindFrameExtractionOptions(m);
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/feature-window.h
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/feature-window.h
@@ -0,0 +1,30 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_WINDOW_H_
			
 
				+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_WINDOW_H_
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Registers the `FrameExtractionOptions` Python class on module `m`.
			
 
				+void PybindFeatureWindow(py::module &m);  // NOLINT
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_FEATURE_WINDOW_H_
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/kaldi-native-fbank.cc
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/kaldi-native-fbank.cc
@@ -0,0 +1,37 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/feature-fbank.h"
			
 
				+#include "kaldi-native-fbank/python/csrc/feature-window.h"
			
 
				+#include "kaldi-native-fbank/python/csrc/mel-computations.h"
			
 
				+#include "kaldi-native-fbank/python/csrc/online-feature.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Definition of the `_kaldi_native_fbank` extension module: sets the module
			
 
				+// docstring, then registers the window, mel, fbank-option and online-feature
			
 
				+// bindings on it.
			
 
				+PYBIND11_MODULE(_kaldi_native_fbank, m) {
			
 
				+  m.doc() = "Python wrapper for kaldi native fbank";
			
 
				+  // Option classes.
			
 
				+  PybindFeatureWindow(m);
			
 
				+  PybindMelComputations(m);
			
 
				+  PybindFeatureFbank(m);
			
 
				+
			
 
				+  // Feature extractor class.
			
 
				+  PybindOnlineFeature(m);
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/kaldi-native-fbank.h
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/kaldi-native-fbank.h
@@ -0,0 +1,27 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_KALDI_NATIVE_FBANK_H_
			
 
				+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_KALDI_NATIVE_FBANK_H_
			
 
				+
			
 
				+#include "pybind11/numpy.h"
			
 
				+#include "pybind11/pybind11.h"
			
 
				+#include "pybind11/stl.h"
			
 
				+namespace py = pybind11;
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_KALDI_NATIVE_FBANK_H_
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/mel-computations.cc
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/mel-computations.cc
@@ -0,0 +1,58 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/mel-computations.h"
			
 
				+
			
 
				+#include <string>
			
 
				+
			
 
				+#include "mel-computations.h"
			
 
				+#include "kaldi-native-fbank/python/csrc/utils.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Registers the Python class `MelBanksOptions`, a binding of
			
 
				+// knf::MelBanksOptions.
			
 
				+//
			
 
				+// Every option field is exposed as a read/write attribute. The class also
			
 
				+// provides `__str__` (via ToString()), `as_dict()`, the static `from_dict()`
			
 
				+// and pickle support that goes through the same dict representation.
			
 
				+static void PybindMelBanksOptions(py::module &m) {  // NOLINT
			
 
				+  using PyClass = MelBanksOptions;
			
 
				+
			
 
				+  py::class_<PyClass> cls(m, "MelBanksOptions");
			
 
				+  cls.def(py::init<>());
			
 
				+
			
 
				+  // Plain data members.
			
 
				+  cls.def_readwrite("num_bins", &PyClass::num_bins);
			
 
				+  cls.def_readwrite("low_freq", &PyClass::low_freq);
			
 
				+  cls.def_readwrite("high_freq", &PyClass::high_freq);
			
 
				+  cls.def_readwrite("vtln_low", &PyClass::vtln_low);
			
 
				+  cls.def_readwrite("vtln_high", &PyClass::vtln_high);
			
 
				+  cls.def_readwrite("debug_mel", &PyClass::debug_mel);
			
 
				+  cls.def_readwrite("htk_mode", &PyClass::htk_mode);
			
 
				+
			
 
				+  // String and dict conversions.
			
 
				+  cls.def("__str__", [](const PyClass &self) -> std::string {
			
 
				+    return self.ToString();
			
 
				+  });
			
 
				+  cls.def("as_dict", [](const PyClass &self) -> py::dict {
			
 
				+    return AsDict(self);
			
 
				+  });
			
 
				+  cls.def_static("from_dict", [](py::dict dict) -> PyClass {
			
 
				+    return MelBanksOptionsFromDict(dict);
			
 
				+  });
			
 
				+
			
 
				+  // Pickling: the state is the dict produced by AsDict().
			
 
				+  cls.def(py::pickle(
			
 
				+      [](const PyClass &self) -> py::dict { return AsDict(self); },
			
 
				+      [](py::dict dict) -> PyClass { return MelBanksOptionsFromDict(dict); }));
			
 
				+}
			
 
				+
			
 
				+// Public entry point of this file: registers the `MelBanksOptions` binding
			
 
				+// on module `m` by delegating to PybindMelBanksOptions().
			
 
				+void PybindMelComputations(py::module &m) {  // NOLINT
			
 
				+  PybindMelBanksOptions(m);
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/mel-computations.h
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/mel-computations.h
@@ -0,0 +1,30 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_MEL_COMPUTATIONS_H_
			
 
				+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_MEL_COMPUTATIONS_H_
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Registers the `MelBanksOptions` Python class on module `m`.
			
 
				+void PybindMelComputations(py::module &m);  // NOLINT
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_MEL_COMPUTATIONS_H_
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/online-feature.cc
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/online-feature.cc
@@ -0,0 +1,68 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/online-feature.h"
			
 
				+
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "online-feature.h"
			
 
				+namespace knf {
			
 
				+
			
 
				+// Registers OnlineGenericBaseFeature<C> as a Python class.
			
 
				+//
			
 
				+// @param m               Module that receives the class.
			
 
				+// @param class_name      Python-visible name of the class.
			
 
				+// @param class_help_doc  Class docstring; empty by default.
			
 
				+template <typename C>
			
 
				+void PybindOnlineFeatureTpl(py::module &m,  // NOLINT
			
 
				+                            const std::string &class_name,
			
 
				+                            const std::string &class_help_doc = "") {
			
 
				+  using PyClass = OnlineGenericBaseFeature<C>;
			
 
				+  using Options = typename C::Options;
			
 
				+
			
 
				+  py::class_<PyClass> cls(m, class_name.c_str(), class_help_doc.c_str());
			
 
				+  cls.def(py::init<const Options &>(), py::arg("opts"));
			
 
				+
			
 
				+  // Read-only properties.
			
 
				+  cls.def_property_readonly("dim", &PyClass::Dim);
			
 
				+  cls.def_property_readonly("frame_shift_in_seconds",
			
 
				+                            &PyClass::FrameShiftInSeconds);
			
 
				+  cls.def_property_readonly("num_frames_ready", &PyClass::NumFramesReady);
			
 
				+
			
 
				+  cls.def("is_last_frame", &PyClass::IsLastFrame, py::arg("frame"));
			
 
				+
			
 
				+  // Returns one frame as a 1-D float array of length Dim() built on the
			
 
				+  // pointer returned by GetFrame(). The Python object itself is passed as
			
 
				+  // the array's base, which keeps a reference to it while the array exists.
			
 
				+  cls.def(
			
 
				+      "get_frame",
			
 
				+      [](py::object obj, int32_t frame) {
			
 
				+        auto *extractor = obj.cast<PyClass *>();
			
 
				+        const float *data = extractor->GetFrame(frame);
			
 
				+        return py::array_t<float>({extractor->Dim()},  // shape
			
 
				+                                  {sizeof(float)},     // stride in bytes
			
 
				+                                  data,                // ptr
			
 
				+                                  obj);                // base object
			
 
				+      },
			
 
				+      py::arg("frame"));
			
 
				+
			
 
				+  // Feeds samples to the extractor; the GIL is released during the call.
			
 
				+  cls.def(
			
 
				+      "accept_waveform",
			
 
				+      [](PyClass &self, float sampling_rate,
			
 
				+         const std::vector<float> &waveform) {
			
 
				+        self.AcceptWaveform(sampling_rate, waveform.data(), waveform.size());
			
 
				+      },
			
 
				+      py::arg("sampling_rate"), py::arg("waveform"),
			
 
				+      py::call_guard<py::gil_scoped_release>());
			
 
				+
			
 
				+  cls.def("input_finished", &PyClass::InputFinished);
			
 
				+}
			
 
				+
			
 
				+// Public entry point of this file: registers the online feature extractor
			
 
				+// instantiated for FbankComputer under the Python name `OnlineFbank`.
			
 
				+void PybindOnlineFeature(py::module &m) {  // NOLINT
			
 
				+  PybindOnlineFeatureTpl<FbankComputer>(m, "OnlineFbank");
			
 
				+}
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/online-feature.h
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/online-feature.h
@@ -0,0 +1,30 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_ONLINE_FEATURE_H_
			
 
				+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_ONLINE_FEATURE_H_
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Registers the `OnlineFbank` Python class on module `m`.
			
 
				+void PybindOnlineFeature(py::module &m);  // NOLINT
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_ONLINE_FEATURE_H_
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/utils.cc
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/utils.cc
@@ -0,0 +1,134 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "kaldi-native-fbank/python/csrc/utils.h"
			
 
				+
			
 
				+#include <string>
			
 
				+
			
 
				+#include "feature-window.h"
			
 
				+
			
 
				+#define FROM_DICT(type, key)         \
			
 
				+  if (dict.contains(#key)) {         \
			
 
				+    opts.key = py::type(dict[#key]); \
			
 
				+  }
			
 
				+
			
 
				+#define AS_DICT(key) dict[#key] = opts.key
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Builds a FrameExtractionOptions from `dict`. Starts from a
			
 
				+// default-constructed object and overwrites only the fields whose names are
			
 
				+// present as keys; other keys in `dict` are not looked at.
			
 
				+FrameExtractionOptions FrameExtractionOptionsFromDict(py::dict dict) {
			
 
				+  FrameExtractionOptions opts;
			
 
				+
			
 
				+  FROM_DICT(float_, samp_freq);
			
 
				+  FROM_DICT(float_, frame_shift_ms);
			
 
				+  FROM_DICT(float_, frame_length_ms);
			
 
				+  FROM_DICT(float_, dither);
			
 
				+  FROM_DICT(float_, preemph_coeff);
			
 
				+  FROM_DICT(bool_, remove_dc_offset);
			
 
				+  FROM_DICT(str, window_type);
			
 
				+  FROM_DICT(bool_, round_to_power_of_two);
			
 
				+  FROM_DICT(float_, blackman_coeff);
			
 
				+  FROM_DICT(bool_, snip_edges);
			
 
				+
			
 
				+  return opts;
			
 
				+}
			
 
				+
			
 
				+// Returns a new dict holding every field of `opts` that is also read by
			
 
				+// FrameExtractionOptionsFromDict(), keyed by the field name.
			
 
				+py::dict AsDict(const FrameExtractionOptions &opts) {
			
 
				+  py::dict dict;
			
 
				+
			
 
				+  AS_DICT(samp_freq);
			
 
				+  AS_DICT(frame_shift_ms);
			
 
				+  AS_DICT(frame_length_ms);
			
 
				+  AS_DICT(dither);
			
 
				+  AS_DICT(preemph_coeff);
			
 
				+  AS_DICT(remove_dc_offset);
			
 
				+  AS_DICT(window_type);
			
 
				+  AS_DICT(round_to_power_of_two);
			
 
				+  AS_DICT(blackman_coeff);
			
 
				+  AS_DICT(snip_edges);
			
 
				+
			
 
				+  return dict;
			
 
				+}
			
 
				+
			
 
				+// Builds a MelBanksOptions from `dict`. Starts from a default-constructed
			
 
				+// object and overwrites only the fields whose names are present as keys.
			
 
				+MelBanksOptions MelBanksOptionsFromDict(py::dict dict) {
			
 
				+  MelBanksOptions opts;
			
 
				+
			
 
				+  FROM_DICT(int_, num_bins);
			
 
				+  FROM_DICT(float_, low_freq);
			
 
				+  FROM_DICT(float_, high_freq);
			
 
				+  FROM_DICT(float_, vtln_low);
			
 
				+  FROM_DICT(float_, vtln_high);
			
 
				+  FROM_DICT(bool_, debug_mel);
			
 
				+  FROM_DICT(bool_, htk_mode);
			
 
				+
			
 
				+  return opts;
			
 
				+}
			
 
				+// Returns a new dict holding every field of `opts` that is also read by
			
 
				+// MelBanksOptionsFromDict(), keyed by the field name.
			
 
				+py::dict AsDict(const MelBanksOptions &opts) {
			
 
				+  py::dict dict;
			
 
				+
			
 
				+  AS_DICT(num_bins);
			
 
				+  AS_DICT(low_freq);
			
 
				+  AS_DICT(high_freq);
			
 
				+  AS_DICT(vtln_low);
			
 
				+  AS_DICT(vtln_high);
			
 
				+  AS_DICT(debug_mel);
			
 
				+  AS_DICT(htk_mode);
			
 
				+
			
 
				+  return dict;
			
 
				+}
			
 
				+
			
 
				+// Builds a FbankOptions from `dict`. Starts from a default-constructed
			
 
				+// object; the optional "frame_opts" and "mel_opts" entries are themselves
			
 
				+// dicts parsed by the corresponding *FromDict helper, and the remaining
			
 
				+// scalar fields are overwritten only when their key is present.
			
 
				+FbankOptions FbankOptionsFromDict(py::dict dict) {
			
 
				+  FbankOptions opts;
			
 
				+
			
 
				+  // Nested option groups.
			
 
				+  if (dict.contains("frame_opts")) {
			
 
				+    opts.frame_opts = FrameExtractionOptionsFromDict(dict["frame_opts"]);
			
 
				+  }
			
 
				+
			
 
				+  if (dict.contains("mel_opts")) {
			
 
				+    opts.mel_opts = MelBanksOptionsFromDict(dict["mel_opts"]);
			
 
				+  }
			
 
				+
			
 
				+  // Scalar fields.
			
 
				+  FROM_DICT(bool_, use_energy);
			
 
				+  FROM_DICT(float_, energy_floor);
			
 
				+  FROM_DICT(bool_, raw_energy);
			
 
				+  FROM_DICT(bool_, htk_compat);
			
 
				+  FROM_DICT(bool_, use_log_fbank);
			
 
				+  FROM_DICT(bool_, use_power);
			
 
				+
			
 
				+  return opts;
			
 
				+}
			
 
				+
			
 
				+// Returns a new dict for `opts`: "frame_opts" and "mel_opts" hold nested
			
 
				+// dicts produced by the matching AsDict() overloads, and the scalar fields
			
 
				+// are stored under their own names.
			
 
				+py::dict AsDict(const FbankOptions &opts) {
			
 
				+  py::dict dict;
			
 
				+
			
 
				+  dict["frame_opts"] = AsDict(opts.frame_opts);
			
 
				+  dict["mel_opts"] = AsDict(opts.mel_opts);
			
 
				+  AS_DICT(use_energy);
			
 
				+  AS_DICT(energy_floor);
			
 
				+  AS_DICT(raw_energy);
			
 
				+  AS_DICT(htk_compat);
			
 
				+  AS_DICT(use_log_fbank);
			
 
				+  AS_DICT(use_power);
			
 
				+
			
 
				+  return dict;
			
 
				+}
			
 
				+
			
 
				+#undef FROM_DICT
			
 
				+#undef AS_DICT
			
 
				+
			
 
				+}  // namespace knf
			
--- a/ggml/examples/kaldi-native-fbank/python/csrc/utils.h
+++ b/ggml/examples/kaldi-native-fbank/python/csrc/utils.h
@@ -0,0 +1,52 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef KALDI_NATIVE_FBANK_PYTHON_CSRC_UTILS_H_
			
 
				+#define KALDI_NATIVE_FBANK_PYTHON_CSRC_UTILS_H_
			
 
				+
			
 
				+#include "feature-fbank.h"
			
 
				+#include "feature-window.h"
			
 
				+#include "mel-computations.h"
			
 
				+#include "kaldi-native-fbank/python/csrc/kaldi-native-fbank.h"
			
 
				+
			
 
				+/*
			
 
				+ * This file contains code about `from_dict` and
			
 
				+ * `as_dict` for various options in kaldi-native-fbank.
			
 
				+ *
			
 
				+ * Regarding `from_dict`, users don't need to provide
			
 
				+ * all the fields in the options. If some fields
			
 
				+ * are not provided, it just uses the default one.
			
 
				+ *
			
 
				+ * If the provided dict in `from_dict` is empty,
			
 
				+ * all fields use their default values.
			
 
				+ */
			
 
				+
			
 
				+namespace knf {
			
 
				+
			
 
				+// Conversions between FrameExtractionOptions and a Python dict.
			
 
				+FrameExtractionOptions FrameExtractionOptionsFromDict(py::dict dict);
			
 
				+py::dict AsDict(const FrameExtractionOptions &opts);
			
 
				+
			
 
				+// Conversions between MelBanksOptions and a Python dict.
			
 
				+MelBanksOptions MelBanksOptionsFromDict(py::dict dict);
			
 
				+py::dict AsDict(const MelBanksOptions &opts);
			
 
				+
			
 
				+// Conversions between FbankOptions and a Python dict.
			
 
				+FbankOptions FbankOptionsFromDict(py::dict dict);
			
 
				+py::dict AsDict(const FbankOptions &opts);
			
 
				+
			
 
				+}  // namespace knf
			
 
				+
			
 
				+#endif  // KALDI_NATIVE_FBANK_PYTHON_CSRC_UTILS_H_
			
--- a/ggml/examples/kaldi-native-fbank/python/kaldi_native_fbank/__init__.py
+++ b/ggml/examples/kaldi-native-fbank/python/kaldi_native_fbank/__init__.py
@@ -0,0 +1,6 @@
 
				+from _kaldi_native_fbank import (
			
 
				+    FrameExtractionOptions,
			
 
				+    MelBanksOptions,
			
 
				+    OnlineFbank,
			
 
				+    FbankOptions,
			
 
				+)
			
--- a/ggml/examples/kaldi-native-fbank/python/tests/CMakeLists.txt
+++ b/ggml/examples/kaldi-native-fbank/python/tests/CMakeLists.txt
@@ -0,0 +1,31 @@
 
				+# kaldi_native_fbank_add_py_test(<source>)
			
 
				+#
			
 
				+# Registers the Python script <source> (relative to this directory) as a
			
 
				+# CTest test named "<basename>_py". The test runs with PYTHONPATH extended
			
 
				+# by the parent of this directory and by the directory that contains the
			
 
				+# built `_kaldi_native_fbank` module.
			
 
				+function(kaldi_native_fbank_add_py_test source)
			
 
				+  message(STATUS "source: ${source}")
			
 
				+
			
 
				+  # Parent of this directory.
			
 
				+  get_filename_component(kaldi_native_fbank_path ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
			
 
				+
			
 
				+  # Test name: file name without extension plus the "_py" suffix.
			
 
				+  get_filename_component(test_stem ${source} NAME_WE)
			
 
				+  set(test_name "${test_stem}_py")
			
 
				+
			
 
				+  add_test(NAME ${test_name}
			
 
				+    COMMAND "${PYTHON_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/${source}"
			
 
				+  )
			
 
				+
			
 
				+  set_property(TEST ${test_name}
			
 
				+    PROPERTY ENVIRONMENT "PYTHONPATH=${kaldi_native_fbank_path}:$<TARGET_FILE_DIR:_kaldi_native_fbank>:$ENV{PYTHONPATH}"
			
 
				+  )
			
 
				+endfunction()
			
 
				+
			
 
				+# Python test scripts registered with CTest.
			
 
				+# Please keep the files sorted in alphabetical order.
			
 
				+set(py_test_files
			
 
				+  test_fbank_options.py
			
 
				+  test_frame_extraction_options.py
			
 
				+  test_mel_bank_options.py
			
 
				+)
			
 
				+
			
 
				+# The tests are only registered when the project's test option is enabled.
			
 
				+if(KALDI_NATIVE_FBANK_BUILD_TESTS)
			
 
				+  foreach(source IN LISTS py_test_files)
			
 
				+    kaldi_native_fbank_add_py_test(${source})
			
 
				+  endforeach()
			
 
				+endif()
			
--- a/ggml/examples/kaldi-native-fbank/python/tests/test_fbank_options.py
+++ b/ggml/examples/kaldi-native-fbank/python/tests/test_fbank_options.py
@@ -0,0 +1,198 @@
 
				+#!/usr/bin/env python3
			
 
				+#
			
 
				+# Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+
			
 
				+
			
 
				+import pickle
			
 
				+
			
 
				+import kaldi_native_fbank as knf
			
 
				+
			
 
				+
			
 
				+def test_default():
			
 
				+    opts = knf.FbankOptions()
			
 
				+    assert opts.frame_opts.samp_freq == 16000
			
 
				+    assert opts.frame_opts.frame_shift_ms == 10.0
			
 
				+    assert opts.frame_opts.frame_length_ms == 25.0
			
 
				+    assert opts.frame_opts.dither == 1.0
			
 
				+    assert abs(opts.frame_opts.preemph_coeff - 0.97) < 1e-6
			
 
				+    assert opts.frame_opts.remove_dc_offset is True
			
 
				+    assert opts.frame_opts.window_type == "povey"
			
 
				+    assert opts.frame_opts.round_to_power_of_two is True
			
 
				+    assert abs(opts.frame_opts.blackman_coeff - 0.42) < 1e-6
			
 
				+    assert opts.frame_opts.snip_edges is True
			
 
				+
			
 
				+    assert opts.mel_opts.num_bins == 23
			
 
				+    assert opts.mel_opts.low_freq == 20
			
 
				+    assert opts.mel_opts.high_freq == 0
			
 
				+    assert opts.mel_opts.vtln_low == 100
			
 
				+    assert opts.mel_opts.vtln_high == -500
			
 
				+    assert opts.mel_opts.debug_mel is False
			
 
				+    assert opts.mel_opts.htk_mode is False
			
 
				+
			
 
				+    assert opts.use_energy is False
			
 
				+    assert opts.energy_floor == 0.0
			
 
				+    assert opts.raw_energy is True
			
 
				+    assert opts.htk_compat is False
			
 
				+    assert opts.use_log_fbank is True
			
 
				+    assert opts.use_power is True
			
 
				+
			
 
				+
			
 
				+def test_set_get():
			
 
				+    opts = knf.FbankOptions()
			
 
				+    opts.use_energy = True
			
 
				+    assert opts.use_energy is True
			
 
				+
			
 
				+    opts.energy_floor = 1
			
 
				+    assert opts.energy_floor == 1
			
 
				+
			
 
				+    opts.raw_energy = False
			
 
				+    assert opts.raw_energy is False
			
 
				+
			
 
				+    opts.htk_compat = True
			
 
				+    assert opts.htk_compat is True
			
 
				+
			
 
				+    opts.use_log_fbank = False
			
 
				+    assert opts.use_log_fbank is False
			
 
				+
			
 
				+    opts.use_power = False
			
 
				+    assert opts.use_power is False
			
 
				+
			
 
				+
			
 
				+def test_set_get_frame_opts():
			
 
				+    opts = knf.FbankOptions()
			
 
				+
			
 
				+    opts.frame_opts.samp_freq = 44100
			
 
				+    assert opts.frame_opts.samp_freq == 44100
			
 
				+
			
 
				+    opts.frame_opts.frame_shift_ms = 20.5
			
 
				+    assert opts.frame_opts.frame_shift_ms == 20.5
			
 
				+
			
 
				+    opts.frame_opts.frame_length_ms = 1
			
 
				+    assert opts.frame_opts.frame_length_ms == 1
			
 
				+
			
 
				+    opts.frame_opts.dither = 0.5
			
 
				+    assert opts.frame_opts.dither == 0.5
			
 
				+
			
 
				+    opts.frame_opts.preemph_coeff = 0.25
			
 
				+    assert opts.frame_opts.preemph_coeff == 0.25
			
 
				+
			
 
				+    opts.frame_opts.remove_dc_offset = False
			
 
				+    assert opts.frame_opts.remove_dc_offset is False
			
 
				+
			
 
				+    opts.frame_opts.window_type = "hanning"
			
 
				+    assert opts.frame_opts.window_type == "hanning"
			
 
				+
			
 
				+    opts.frame_opts.round_to_power_of_two = False
			
 
				+    assert opts.frame_opts.round_to_power_of_two is False
			
 
				+
			
 
				+    opts.frame_opts.blackman_coeff = 0.25
			
 
				+    assert opts.frame_opts.blackman_coeff == 0.25
			
 
				+
			
 
				+    opts.frame_opts.snip_edges = False
			
 
				+    assert opts.frame_opts.snip_edges is False
			
 
				+
			
 
				+
			
 
				+def test_set_get_mel_opts():
			
 
				+    opts = knf.FbankOptions()
			
 
				+
			
 
				+    opts.mel_opts.num_bins = 100
			
 
				+    assert opts.mel_opts.num_bins == 100
			
 
				+
			
 
				+    opts.mel_opts.low_freq = 22
			
 
				+    assert opts.mel_opts.low_freq == 22
			
 
				+
			
 
				+    opts.mel_opts.high_freq = 1
			
 
				+    assert opts.mel_opts.high_freq == 1
			
 
				+
			
 
				+    opts.mel_opts.vtln_low = 101
			
 
				+    assert opts.mel_opts.vtln_low == 101
			
 
				+
			
 
				+    opts.mel_opts.vtln_high = -100
			
 
				+    assert opts.mel_opts.vtln_high == -100
			
 
				+
			
 
				+    opts.mel_opts.debug_mel = True
			
 
				+    assert opts.mel_opts.debug_mel is True
			
 
				+
			
 
				+    opts.mel_opts.htk_mode = True
			
 
				+    assert opts.mel_opts.htk_mode is True
			
 
				+
			
 
				+
			
 
				+def test_from_empty_dict():
			
 
				+    opts = knf.FbankOptions.from_dict({})
			
 
				+    opts2 = knf.FbankOptions()
			
 
				+
			
 
				+    assert str(opts) == str(opts2)
			
 
				+
			
 
				+
			
 
				+def test_from_dict_partial():
			
 
				+    d = {
			
 
				+        "energy_floor": 10.5,
			
 
				+        "htk_compat": True,
			
 
				+        "mel_opts": {"num_bins": 80, "vtln_low": 1},
			
 
				+        "frame_opts": {"window_type": "hanning"},
			
 
				+    }
			
 
				+    opts = knf.FbankOptions.from_dict(d)
			
 
				+    assert opts.energy_floor == 10.5
			
 
				+    assert opts.htk_compat is True
			
 
				+    assert opts.mel_opts.num_bins == 80
			
 
				+    assert opts.mel_opts.vtln_low == 1
			
 
				+    assert opts.frame_opts.window_type == "hanning"
			
 
				+
			
 
				+    mel_opts = knf.MelBanksOptions.from_dict(d["mel_opts"])
			
 
				+    assert str(opts.mel_opts) == str(mel_opts)
			
 
				+
			
 
				+
			
 
				+def test_from_dict_full_and_as_dict():
			
 
				+    opts = knf.FbankOptions()
			
 
				+    opts.htk_compat = True
			
 
				+    opts.mel_opts.num_bins = 80
			
 
				+    opts.frame_opts.samp_freq = 10
			
 
				+
			
 
				+    d = opts.as_dict()
			
 
				+    assert d["htk_compat"] is True
			
 
				+    assert d["mel_opts"]["num_bins"] == 80
			
 
				+    assert d["frame_opts"]["samp_freq"] == 10
			
 
				+
			
 
				+    mel_opts = knf.MelBanksOptions()
			
 
				+    mel_opts.num_bins = 80
			
 
				+    assert d["mel_opts"] == mel_opts.as_dict()
			
 
				+
			
 
				+    frame_opts = knf.FrameExtractionOptions()
			
 
				+    frame_opts.samp_freq = 10
			
 
				+    assert d["frame_opts"] == frame_opts.as_dict()
			
 
				+
			
 
				+    opts2 = knf.FbankOptions.from_dict(d)
			
 
				+    assert str(opts2) == str(opts)
			
 
				+
			
 
				+    d["htk_compat"] = False
			
 
				+    opts3 = knf.FbankOptions.from_dict(d)
			
 
				+    assert opts3.htk_compat is False
			
 
				+
			
 
				+
			
 
				+def test_pickle():
			
 
				+    opts = knf.FbankOptions()
			
 
				+    opts.use_energy = True
			
 
				+    opts.use_power = False
			
 
				+
			
 
				+    opts.frame_opts.samp_freq = 44100
			
 
				+    opts.mel_opts.num_bins = 100
			
 
				+
			
 
				+    data = pickle.dumps(opts)
			
 
				+
			
 
				+    opts2 = pickle.loads(data)
			
 
				+    assert str(opts) == str(opts2)
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    test_default()
			
 
				+    test_set_get()
			
 
				+    test_set_get_frame_opts()
			
 
				+    test_set_get_mel_opts()
			
 
				+    test_from_empty_dict()
			
 
				+    test_from_dict_partial()
			
 
				+    test_from_dict_full_and_as_dict()
			
 
				+    test_pickle()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/ggml/examples/kaldi-native-fbank/python/tests/test_frame_extraction_options.py
+++ b/ggml/examples/kaldi-native-fbank/python/tests/test_frame_extraction_options.py
@@ -0,0 +1,119 @@
 
				+#!/usr/bin/env python3
			
 
				+#
			
 
				+# Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+
			
 
				+import pickle
			
 
				+
			
 
				+import kaldi_native_fbank as knf
			
 
				+
			
 
				+
			
 
				+def test_default():
			
 
				+    opts = knf.FrameExtractionOptions()
			
 
				+    assert opts.samp_freq == 16000
			
 
				+    assert opts.frame_shift_ms == 10.0
			
 
				+    assert opts.frame_length_ms == 25.0
			
 
				+    assert opts.dither == 1.0
			
 
				+    assert abs(opts.preemph_coeff - 0.97) < 1e-6
			
 
				+    assert opts.remove_dc_offset is True
			
 
				+    assert opts.window_type == "povey"
			
 
				+    assert opts.round_to_power_of_two is True
			
 
				+    assert abs(opts.blackman_coeff - 0.42) < 1e-6
			
 
				+    assert opts.snip_edges is True
			
 
				+
			
 
				+
			
 
				+def test_set_get():
			
 
				+    opts = knf.FrameExtractionOptions()
			
 
				+    opts.samp_freq = 44100
			
 
				+    assert opts.samp_freq == 44100
			
 
				+
			
 
				+    opts.frame_shift_ms = 20.5
			
 
				+    assert opts.frame_shift_ms == 20.5
			
 
				+
			
 
				+    opts.frame_length_ms = 1
			
 
				+    assert opts.frame_length_ms == 1
			
 
				+
			
 
				+    opts.dither = 0.5
			
 
				+    assert opts.dither == 0.5
			
 
				+
			
 
				+    opts.preemph_coeff = 0.25
			
 
				+    assert opts.preemph_coeff == 0.25
			
 
				+
			
 
				+    opts.remove_dc_offset = False
			
 
				+    assert opts.remove_dc_offset is False
			
 
				+
			
 
				+    opts.window_type = "hanning"
			
 
				+    assert opts.window_type == "hanning"
			
 
				+
			
 
				+    opts.round_to_power_of_two = False
			
 
				+    assert opts.round_to_power_of_two is False
			
 
				+
			
 
				+    opts.blackman_coeff = 0.25
			
 
				+    assert opts.blackman_coeff == 0.25
			
 
				+
			
 
				+    opts.snip_edges = False
			
 
				+    assert opts.snip_edges is False
			
 
				+
			
 
				+
			
 
				+def test_from_empty_dict():
			
 
				+    opts = knf.FrameExtractionOptions.from_dict({})
			
 
				+    opts2 = knf.FrameExtractionOptions()
			
 
				+
			
 
				+    assert str(opts) == str(opts2)
			
 
				+
			
 
				+
			
 
				+def test_from_dict_partial():
			
 
				+    d = {"samp_freq": 10, "frame_shift_ms": 2}
			
 
				+
			
 
				+    opts = knf.FrameExtractionOptions.from_dict(d)
			
 
				+
			
 
				+    opts2 = knf.FrameExtractionOptions()
			
 
				+    assert str(opts) != str(opts2)
			
 
				+
			
 
				+    opts2.samp_freq = 10
			
 
				+    assert str(opts) != str(opts2)
			
 
				+
			
 
				+    opts2.frame_shift_ms = 2
			
 
				+    assert str(opts) == str(opts2)
			
 
				+
			
 
				+    opts2.frame_shift_ms = 3
			
 
				+    assert str(opts) != str(opts2)
			
 
				+
			
 
				+
			
 
				+def test_from_dict_full_and_as_dict():
			
 
				+    opts = knf.FrameExtractionOptions()
			
 
				+    opts.samp_freq = 20
			
 
				+    opts.frame_length_ms = 100
			
 
				+
			
 
				+    d = opts.as_dict()
			
 
				+    for key, value in d.items():
			
 
				+        assert value == getattr(opts, key)
			
 
				+
			
 
				+    opts2 = knf.FrameExtractionOptions.from_dict(d)
			
 
				+    assert str(opts2) == str(opts)
			
 
				+
			
 
				+    d["window_type"] = "hanning"
			
 
				+    opts3 = knf.FrameExtractionOptions.from_dict(d)
			
 
				+    assert opts3.window_type == "hanning"
			
 
				+
			
 
				+
			
 
				+def test_pickle():
			
 
				+    opts = knf.FrameExtractionOptions()
			
 
				+    opts.samp_freq = 44100
			
 
				+    opts.dither = 5.5
			
 
				+    data = pickle.dumps(opts)
			
 
				+
			
 
				+    opts2 = pickle.loads(data)
			
 
				+    assert str(opts) == str(opts2)
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    test_default()
			
 
				+    test_set_get()
			
 
				+    test_from_empty_dict()
			
 
				+    test_from_dict_partial()
			
 
				+    test_from_dict_full_and_as_dict()
			
 
				+    test_pickle()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/ggml/examples/kaldi-native-fbank/python/tests/test_mel_bank_options.py
+++ b/ggml/examples/kaldi-native-fbank/python/tests/test_mel_bank_options.py
@@ -0,0 +1,107 @@
 
				+#!/usr/bin/env python3
			
 
				+#
			
 
				+# Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+
			
 
				+import pickle
			
 
				+
			
 
				+import kaldi_native_fbank as knf
			
 
				+
			
 
				+
			
 
				+def test_default():
			
 
				+    opts = knf.MelBanksOptions()
			
 
				+    assert opts.num_bins == 25
			
 
				+    assert opts.low_freq == 20
			
 
				+    assert opts.high_freq == 0
			
 
				+    assert opts.vtln_low == 100
			
 
				+    assert opts.vtln_high == -500
			
 
				+    assert opts.debug_mel is False
			
 
				+    assert opts.htk_mode is False
			
 
				+
			
 
				+
			
 
				+def test_set_get():
			
 
				+    opts = knf.MelBanksOptions()
			
 
				+    opts.num_bins = 100
			
 
				+    assert opts.num_bins == 100
			
 
				+
			
 
				+    opts.low_freq = 22
			
 
				+    assert opts.low_freq == 22
			
 
				+
			
 
				+    opts.high_freq = 1
			
 
				+    assert opts.high_freq == 1
			
 
				+
			
 
				+    opts.vtln_low = 101
			
 
				+    assert opts.vtln_low == 101
			
 
				+
			
 
				+    opts.vtln_high = -100
			
 
				+    assert opts.vtln_high == -100
			
 
				+
			
 
				+    opts.debug_mel = True
			
 
				+    assert opts.debug_mel is True
			
 
				+
			
 
				+    opts.htk_mode = True
			
 
				+    assert opts.htk_mode is True
			
 
				+
			
 
				+
			
 
				+def test_from_empty_dict():
			
 
				+    opts = knf.MelBanksOptions.from_dict({})
			
 
				+    opts2 = knf.MelBanksOptions()
			
 
				+
			
 
				+    assert str(opts) == str(opts2)
			
 
				+
			
 
				+
			
 
				+def test_from_dict_partial():
			
 
				+    d = {"num_bins": 10, "debug_mel": True}
			
 
				+
			
 
				+    opts = knf.MelBanksOptions.from_dict(d)
			
 
				+
			
 
				+    opts2 = knf.MelBanksOptions()
			
 
				+    assert str(opts) != str(opts2)
			
 
				+
			
 
				+    opts2.num_bins = 10
			
 
				+    assert str(opts) != str(opts2)
			
 
				+
			
 
				+    opts2.debug_mel = True
			
 
				+    assert str(opts) == str(opts2)
			
 
				+
			
 
				+    opts2.debug_mel = False
			
 
				+    assert str(opts) != str(opts2)
			
 
				+
			
 
				+
			
 
				+def test_from_dict_full_and_as_dict():
			
 
				+    opts = knf.MelBanksOptions()
			
 
				+    opts.num_bins = 80
			
 
				+    opts.vtln_high = 2
			
 
				+
			
 
				+    d = opts.as_dict()
			
 
				+    for key, value in d.items():
			
 
				+        assert value == getattr(opts, key)
			
 
				+
			
 
				+    opts2 = knf.MelBanksOptions.from_dict(d)
			
 
				+    assert str(opts2) == str(opts)
			
 
				+
			
 
				+    d["htk_mode"] = True
			
 
				+    opts3 = knf.MelBanksOptions.from_dict(d)
			
 
				+    assert opts3.htk_mode is True
			
 
				+
			
 
				+
			
 
				+def test_pickle():
			
 
				+    opts = knf.MelBanksOptions()
			
 
				+    opts.num_bins = 100
			
 
				+    opts.low_freq = 22
			
 
				+    data = pickle.dumps(opts)
			
 
				+
			
 
				+    opts2 = pickle.loads(data)
			
 
				+    assert str(opts) == str(opts2)
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    test_default()
			
 
				+    test_set_get()
			
 
				+    test_from_empty_dict()
			
 
				+    test_from_dict_partial()
			
 
				+    test_from_dict_full_and_as_dict()
			
 
				+    test_pickle()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/ggml/examples/kaldi-native-fbank/python/tests/test_online_fbank.py
+++ b/ggml/examples/kaldi-native-fbank/python/tests/test_online_fbank.py
@@ -0,0 +1,48 @@
 
				+#!/usr/bin/env python3
			
 
				+
			
 
				+import sys
			
 
				+
			
 
				+try:
			
 
				+    import kaldifeat
			
 
				+except:
			
 
				+    print("Please install kaldifeat first")
			
 
				+    sys.exit(0)
			
 
				+
			
 
				+import kaldi_native_fbank as knf
			
 
				+import torch
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    sampling_rate = 16000
			
 
				+    samples = torch.randn(16000 * 10)
			
 
				+
			
 
				+    opts = kaldifeat.FbankOptions()
			
 
				+    opts.frame_opts.dither = 0
			
 
				+    opts.mel_opts.num_bins = 80
			
 
				+    opts.frame_opts.snip_edges = False
			
 
				+    opts.mel_opts.debug_mel = False
			
 
				+
			
 
				+    online_fbank = kaldifeat.OnlineFbank(opts)
			
 
				+
			
 
				+    online_fbank.accept_waveform(sampling_rate, samples)
			
 
				+
			
 
				+    opts = knf.FbankOptions()
			
 
				+    opts.frame_opts.dither = 0
			
 
				+    opts.mel_opts.num_bins = 80
			
 
				+    opts.frame_opts.snip_edges = False
			
 
				+    opts.mel_opts.debug_mel = False
			
 
				+
			
 
				+    fbank = knf.OnlineFbank(opts)
			
 
				+    fbank.accept_waveform(sampling_rate, samples.tolist())
			
 
				+
			
 
				+    assert online_fbank.num_frames_ready == fbank.num_frames_ready
			
 
				+    for i in range(fbank.num_frames_ready):
			
 
				+        f1 = online_fbank.get_frame(i)
			
 
				+        f2 = torch.from_numpy(fbank.get_frame(i))
			
 
				+        assert torch.allclose(f1, f2, atol=1e-3), (i, (f1 - f2).abs().max())
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    torch.manual_seed(20220825)
			
 
				+    main()
			
 
				+    print("success")
			
--- a/ggml/examples/unity/CMakeLists.txt
+++ b/ggml/examples/unity/CMakeLists.txt
@@ -2,7 +2,7 @@
 
				 
			
 
				 add_executable(unity unity.cpp)
			
 
				 target_include_directories(unity PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
			
 
				-target_link_libraries(unity PRIVATE ggml common common-ggml)
			
 
				+target_link_libraries(unity PRIVATE ggml common common-ggml kaldi-native-fbank)
			
 
				 target_sources(unity
			
 
				     PRIVATE
			
 
				         fairseq2.cpp
			
--- a/ggml/examples/unity/fairseq2.cpp
+++ b/ggml/examples/unity/fairseq2.cpp
@@ -1,8 +1,11 @@
 
				 #include <math.h>
			
 
				+#include "kaldi-native-fbank/csrc/feature-fbank.h"
			
 
				+#include "kaldi-native-fbank/csrc/feature-window.h"
			
 
				 #include "ggml.h"
			
 
				 #include "fairseq2.h"
			
 
				 #include <unordered_map>
			
 
				 #include <algorithm>
			
 
				+#include <iostream>
			
 
				 
			
 
				 
			
 
				 /// allocate the fairseq2 model and hyperparameters
			
@@ -13,14 +16,14 @@ extern "C" fairseq2_model* fairseq2_model_alloc() {
 
				     model->arch = new std::uint64_t[16 * 1024];  // max tensors allowed
			
 
				     model->tensors_ctx = nullptr;
			
 
				     return model;
			
 
				-};
			
 
				+}
			
 
				 
			
 
				 extern "C" void fairseq2_model_free(fairseq2_model* model) {
			
 
				     if (model->tensors_ctx) ggml_free(model->tensors_ctx);
			
 
				     delete (std::uint64_t*)(model->arch);
			
 
				     delete (std::uint8_t*)model->hparams;
			
 
				     delete model;
			
 
				-};
			
 
				+}
			
 
				 
			
 
				 extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx) {
			
 
				     model->ctx = ctx;
			
@@ -92,6 +95,21 @@ extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
 
				     return seqs;
			
 
				 }
			
 
				 
			
 
				+extern "C" ggml_tensor* SiluFeedForwardNetwork_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs
			
 
				+) {
			
 
				+    seqs = Linear_forward(model, prefix + ".inner_proj", seqs);
			
 
				+    seqs = ggml_silu(model.ctx, seqs);
			
 
				+
			
 
				+    if (has_layer(model, prefix + ".inner_layer_norm")) {
			
 
				+        seqs = LayerNorm_forward(model, prefix + ".inner_layer_norm", seqs);
			
 
				+    }
			
 
				+
			
 
				+    seqs = Linear_forward(model, prefix + ".output_proj", seqs);
			
 
				+    return seqs;
			
 
				+}
			
 
				 
			
 
				 ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim) {
			
 
				     int n_dims = x->n_dims;
			
@@ -286,6 +304,321 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
 
				     return seqs;
			
 
				 }
			
 
				 
			
 
				+extern "C" ggml_tensor* WaveformToFbank_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string &prefix,
			
 
				+    ggml_tensor* waveform 
			
 
				+) {
			
 
				+    // Hardcoding: num_bins 80, sample rate 16k, always standardize
			
 
				+    ggml_context* ctx = model.ctx;
			
 
				+    knf::MelBanksOptions mel_opts{};
			
 
				+    mel_opts.num_bins = 80;
			
 
				+
			
 
				+    knf::FrameExtractionOptions frame_opts{};
			
 
				+    frame_opts.samp_freq = 16000;
			
 
				+
			
 
				+    knf::FbankOptions opts{};
			
 
				+    opts.frame_opts = frame_opts;
			
 
				+    opts.mel_opts = mel_opts;
			
 
				+    
			
 
				+
			
 
				+    std::vector<float_t> signal_frame{};
			
 
				+    std::int32_t num_frames = knf::NumFrames(/*num_samples=*/waveform->ne[0], frame_opts);
			
 
				+    struct ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 80, num_frames);
			
 
				+    knf::FbankComputer native_(opts);
			
 
				+    knf::FeatureWindowFunction window_fn_(native_.GetFrameOptions());
			
 
				+
			
 
				+    for (std::int32_t frame_nr = 0; frame_nr < num_frames; ++frame_nr) {
			
 
				+        signal_frame.resize(0);
			
 
				+
			
 
				+        // Extract the frame from the waveform tensor.
			
 
				+        knf::ExtractWindow(
			
 
				+            /*sample_offset=*/0,
			
 
				+            (float *)(waveform->data),
			
 
				+            waveform->ne[0],
			
 
				+            frame_nr,
			
 
				+            frame_opts,
			
 
				+            window_fn_,
			
 
				+            &signal_frame);
			
 
				+
			
 
				+        native_.Compute(
			
 
				+            /*signal_raw_log_energy=*/0, /*vtln_warp=*/1.0, &signal_frame, ((float *)(output->data) + frame_nr * 80));
			
 
				+    }
			
 
				+    output = ggml_dup(ctx, ggml_transpose(ctx, output));
			
 
				+    output = ggml_norm(ctx, output, 1e-5);
			
 
				+    output = ggml_dup(ctx, ggml_transpose(ctx, output));
			
 
				+    if (output->ne[1] % 2 == 1) {
			
 
				+        struct ggml_tensor * remove_last = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, output->ne[1]-1);
			
 
				+        for (int i = 0; i < output->ne[1]-1; ++i) {
			
 
				+            ((int32_t *) remove_last->data)[i] = i;
			
 
				+        }
			
 
				+        output = ggml_get_rows(ctx, output, remove_last);
			
 
				+    }
			
 
				+    output = ggml_reshape_2d(ctx, output, output->ne[0] * 2, output->ne[1] / 2);
			
 
				+    return output;
			
 
				+}
			
 
				+
			
 
				+// TODO: Check if it's possible to merge with standard MHA
			
 
				+extern "C" ggml_tensor* RelativePositionMHA_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs
			
 
				+) {
			
 
				+    ggml_context* ctx = model.ctx;
			
 
				+
			
 
				+    ggml_tensor* residual = seqs;
			
 
				+    seqs = LayerNorm_forward(model, prefix + "_layer_norm", seqs);
			
 
				+    // self_attn: qkv
			
 
				+    struct ggml_tensor * Qcur = Linear_forward(model, prefix + ".q_proj", seqs);
			
 
				+    struct ggml_tensor * Kcur = Linear_forward(model, prefix + ".k_proj", seqs);
			
 
				+    struct ggml_tensor * Vcur = Linear_forward(model, prefix + ".v_proj", seqs);
			
 
				+    
			
 
				+    // self_attn: rel_pos SDPA
			
 
				+    int32_t S = seqs->ne[1];
			
 
				+    int32_t H = 16; // TODO: Make this configurable
			
 
				+    int32_t n_ctx = 4096;
			
 
				+    int32_t K_h = seqs->ne[0] / H;
			
 
				+    
			
 
				+    int32_t start_index = n_ctx - S;
			
 
				+    int32_t end_index = n_ctx + S - 1;
			
 
				+
			
 
				+    int num_indices = end_index - start_index;
			
 
				+
			
 
				+    struct ggml_tensor *rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices);
			
 
				+    rows->data = malloc(ggml_nbytes(rows));
			
 
				+
			
 
				+    for (int i = 0; i < num_indices; i++) {
			
 
				+        ((int32_t *)rows->data)[i] = start_index + i;
			
 
				+    }
			
 
				+    
			
 
				+    // self_attn: load pos_enc weights & compute_r
			
 
				+    // In fairseq2 pos_enc weights are calculated on the fly, since some more custom operators might be needed to enable this, 
			
 
				+    // we store the results (fixed) in checkpoint as model.audio_enc_pos_enc_w and load directly. 
			
 
				+    struct ggml_tensor * r = ggml_get_rows(ctx, model.tensors["speech_encoder.pos_enc"], rows);
			
 
				+    r = ggml_mul_mat(ctx, model.tensors[prefix + ".sdpa.r_proj.weight"], r);
			
 
				+    r = ggml_dup(ctx, ggml_permute(ctx,
			
 
				+                        ggml_cpy(ctx,
			
 
				+                            r,
			
 
				+                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S*2-1)),
			
 
				+                        0, 2, 1, 3));
			
 
				+    
			
 
				+    struct ggml_tensor * u_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.u_bias"], K_h, 1, H);
			
 
				+    struct ggml_tensor * v_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.v_bias"], K_h, 1, H);
			
 
				+
			
 
				+    // self_attn: Permute QKV
			
 
				+    
			
 
				+    struct ggml_tensor * Q =
			
 
				+                ggml_dup(ctx, ggml_permute(ctx,
			
 
				+                        ggml_cpy(ctx,
			
 
				+                            Qcur,
			
 
				+                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
			
 
				+                        0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
			
 
				+    struct ggml_tensor * K = 
			
 
				+                ggml_dup(ctx, ggml_permute(ctx,
			
 
				+                        ggml_cpy(ctx,
			
 
				+                            Kcur,
			
 
				+                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
			
 
				+                        0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
			
 
				+    struct ggml_tensor * V = 
			
 
				+                ggml_dup(ctx, ggml_permute(ctx,
			
 
				+                        ggml_cpy(ctx,
			
 
				+                            Vcur,
			
 
				+                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
			
 
				+                        1, 2, 0, 3)); // (H * K_h, S) -> (K_h, H, S) -> (H, S, K_h)
			
 
				+    
			
 
				+    
			
 
				+    struct ggml_tensor * q_with_u_bias = ggml_add(ctx, Q, u_bias); // (K_h, S, H)
			
 
				+    struct ggml_tensor * q_with_v_bias = ggml_add(ctx, Q, v_bias); // (K_h, S, H)
			
 
				+    
			
 
				+    struct ggml_tensor * ac = ggml_mul_mat(ctx, K, q_with_u_bias);
			
 
				+    struct ggml_tensor * bd = ggml_mul_mat(ctx, r, q_with_v_bias);
			
 
				+    
			
 
				+    
			
 
				+    // self_attn: shift_bd. Logic follows https://github.com/facebookresearch/fairseq2/blob/main/src/fairseq2/nn/transformer/relative_attention.py#L161
			
 
				+    bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // H, S, 2S-1
			
 
				+    
			
 
				+    struct ggml_tensor * pad = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, H, S, 1);
			
 
				+    pad->data = malloc(ggml_nbytes(pad));
			
 
				+
			
 
				+    pad = ggml_set_f32(pad, 0.0);
			
 
				+    bd = ggml_concat(ctx, pad, bd); // bd[i][j][0] == 0, (H, S, 2S)
			
 
				+    bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // (2S, S, H) 
			
 
				+    bd = ggml_dup(ctx, ggml_reshape_3d(ctx, bd, S, 2*S, H));  // (S, 2S, H)
			
 
				+    bd = ggml_remove_head_row(ctx, bd); // A custom operator introduced to reduce 1st row (in the 2nd dim)
			
 
				+
			
 
				+    bd = ggml_reshape_3d(ctx, bd, 2*S-1, S, H);
			
 
				+
			
 
				+    bd = ggml_get_first_cols_by_rows(ctx, bd); // A custom operator introduced to get first #rows cols. 
			
 
				+    
			
 
				+
			
 
				+    // self_attn: compute attn / weights
			
 
				+    struct ggml_tensor * attn_weights = ggml_add(ctx, ac, bd);
			
 
				+    struct ggml_tensor * attn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
			
 
				+    attn_scale->data = malloc(ggml_nbytes(attn_scale));
			
 
				+    ggml_set_f32(attn_scale, 1.0 / pow(K_h, 0.5));
			
 
				+    attn_weights = ggml_mul(ctx, ggml_repeat(ctx, attn_scale, attn_weights), attn_weights);
			
 
				+    attn_weights = ggml_soft_max(ctx, attn_weights);
			
 
				+    
			
 
				+    struct ggml_tensor * attn = ggml_mul_mat(ctx, V, attn_weights); // K_h, S, H
			
 
				+    attn = ggml_dup(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));
			
 
				+    struct ggml_tensor * attn_2d = ggml_reshape_2d(ctx, attn, K_h * H, S); 
			
 
				+    
			
 
				+    struct ggml_tensor * attn_out = ggml_mul_mat(ctx, model.tensors[prefix + ".output_proj.weight"], attn_2d);
			
 
				+    attn_out = ggml_add(ctx,
			
 
				+            ggml_repeat(ctx,
			
 
				+                model.tensors[prefix + ".output_proj.bias"],
			
 
				+                attn_out),
			
 
				+            attn_out);
			
 
				+    attn_out = ggml_add(ctx, residual, attn_out);
			
 
				+    return attn_out;
			
 
				+}
			
 
				+
			
 
				+extern "C" ggml_tensor* ConvModule_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs
			
 
				+) {
			
 
				+        ggml_context* ctx = model.ctx;
			
 
				+        ggml_tensor* residual = seqs;
			
 
				+        seqs = LayerNorm_forward(model, prefix + "_layer_norm", seqs);
			
 
				+        // conv: Use matmul for pointwise conv 1 - kernel_size=1, no padding case
			
 
				+        seqs = ggml_mul_mat(ctx, model.tensors[prefix + ".pointwise_conv1.weight"], seqs);
			
 
				+        
			
 
				+        // conv: GLU
			
 
				+        seqs = ggml_glu(ctx, seqs);
			
 
				+        seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
			
 
				+       
			
 
				+        // S x C -> (S+K-1) x C -> K x S x C -> S x C
			
 
				+        seqs = ggml_conv_1d(ctx, model.tensors[prefix + ".depthwise_conv.weight"], seqs, 1, 15, 1);
			
 
				+        
			
 
				+        // conv: Custom implementation of batch norm
			
 
				+        seqs = ggml_batch_norm(ctx, seqs, model.tensors[prefix + ".batch_norm.weight"], model.tensors[prefix + ".batch_norm.bias"], model.tensors[prefix + ".batch_norm.running_mean"], model.tensors[prefix + ".batch_norm.running_var"], 1e-5);
			
 
				+        
			
 
				+        // conv: SiLU actvation
			
 
				+        seqs = ggml_silu(ctx, seqs);
			
 
				+        seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
			
 
				+
			
 
				+        // conv: Use matmul for pointwise conv 2 - kernel_size=1, no padding case
			
 
				+        seqs = ggml_mul_mat(ctx, model.tensors[prefix + ".pointwise_conv2.weight"], seqs); 
			
 
				+
			
 
				+        // conv: + residual
			
 
				+        seqs = ggml_add(ctx, seqs, residual);
			
 
				+        return seqs;
			
 
				+}
			
 
				+
			
 
				+// Chains the sub-modules of one Conformer encoder block, in this order:
			
 
				+//   FFN1 (scaled by 0.5, plus skip) -> relative-position self-attention
			
 
				+//   -> convolution module -> FFN2 (scaled by 0.5, plus skip) -> LayerNorm.
			
 
				+// `prefix` selects the block's tensors inside `model`.
			
 
				+// `padding_mask` is accepted but never read in this function.
			
 
				+extern "C" ggml_tensor* StandardConformerEncoderLayer_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs,
			
 
				+    ggml_tensor* padding_mask
			
 
				+) {
			
 
				+    ggml_context* ctx = model.ctx;
			
 
				+
			
 
				+    // 1x1 F32 constant holding 0.5; it scales both feed-forward outputs.
			
 
				+    // NOTE: the buffer comes from malloc and is not released here.
			
 
				+    ggml_tensor* half = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
			
 
				+    half->data = malloc(ggml_nbytes(half));
			
 
				+    ggml_set_f32(half, 0.5f);
			
 
				+
			
 
				+    // First feed-forward: seqs = 0.5 * FFN1(LN(seqs)) + seqs
			
 
				+    ggml_tensor* skip = seqs;
			
 
				+    seqs = LayerNorm_forward(model, prefix + ".ffn1_layer_norm", seqs);
			
 
				+    seqs = SiluFeedForwardNetwork_forward(model, prefix + ".ffn1", seqs);
			
 
				+    ggml_tensor* half_like_seqs = ggml_repeat(ctx, half, seqs);
			
 
				+    seqs = ggml_mul(ctx, half_like_seqs, seqs);
			
 
				+    seqs = ggml_add(ctx, seqs, skip);
			
 
				+
			
 
				+    // Self-attention followed by the convolution module.
			
 
				+    seqs = RelativePositionMHA_forward(model, prefix + ".self_attn", seqs);
			
 
				+    seqs = ConvModule_forward(model, prefix + ".conv", seqs);
			
 
				+
			
 
				+    // Second feed-forward: seqs = 0.5 * FFN2(LN(seqs)) + seqs
			
 
				+    skip = seqs;
			
 
				+    seqs = LayerNorm_forward(model, prefix + ".ffn2_layer_norm", seqs);
			
 
				+    seqs = SiluFeedForwardNetwork_forward(model, prefix + ".ffn2", seqs);
			
 
				+    half_like_seqs = ggml_repeat(ctx, half, seqs);
			
 
				+    seqs = ggml_mul(ctx, half_like_seqs, seqs);
			
 
				+    seqs = ggml_add(ctx, seqs, skip);
			
 
				+
			
 
				+    // Closing layer norm of the block.
			
 
				+    return LayerNorm_forward(model, prefix + ".layer_norm", seqs);
			
 
				+}
			
 
				+
			
 
				+// Runs the speech encoder end to end on `seqs`:
			
 
				+//   WaveformToFbank -> frontend LayerNorm + Linear projection
			
 
				+//   -> every `<prefix>.inner.layers.N` Conformer block that exists in `model`
			
 
				+//   -> inner LayerNorm -> proj1/ReLU/proj2 scaled by 0.5 plus skip
			
 
				+//   -> every `<prefix>.adaptor_layers.N` adaptor block -> final LayerNorm.
			
 
				+// `padding_mask` is only forwarded to the per-layer functions.
			
 
				+extern "C" ggml_tensor* StandardConformerEncoder_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs,
			
 
				+    ggml_tensor* padding_mask
			
 
				+) {
			
 
				+    ggml_context* ctx = model.ctx;
			
 
				+
			
 
				+    // Frontend: feature extraction, normalisation, projection to model dim.
			
 
				+    seqs = WaveformToFbank_forward(model, prefix, seqs);
			
 
				+    seqs = LayerNorm_forward(model, prefix + "_frontend.post_extract_layer_norm", seqs);
			
 
				+    seqs = Linear_forward(model, prefix + "_frontend.model_dim_proj", seqs);
			
 
				+
			
 
				+    // Conformer blocks: keep going until the next numbered layer is missing.
			
 
				+    for (int block_idx = 0; ; ++block_idx) {
			
 
				+        std::string block_name = prefix + ".inner.layers." + std::to_string(block_idx);
			
 
				+        if (!has_layer(model, block_name)) {
			
 
				+            break;
			
 
				+        }
			
 
				+        seqs = StandardConformerEncoderLayer_forward(model, block_name, seqs, padding_mask);
			
 
				+        ggml_set_name(seqs, ("x_enc_" + std::to_string(block_idx)).c_str());
			
 
				+    }
			
 
				+
			
 
				+    seqs = LayerNorm_forward(model, prefix + ".inner_layer_norm", seqs);
			
 
				+
			
 
				+    // Projection feed-forward: seqs = 0.5 * proj2(relu(proj1(seqs))) + seqs
			
 
				+    ggml_tensor* skip = seqs;
			
 
				+    seqs = Linear_forward(model, prefix + ".proj1", seqs);
			
 
				+    seqs = ggml_relu_inplace(ctx, seqs);
			
 
				+    seqs = Linear_forward(model, prefix + ".proj2", seqs);
			
 
				+
			
 
				+    // 1x1 F32 constant holding 0.5.
			
 
				+    // NOTE: the buffer comes from malloc and is not released here.
			
 
				+    ggml_tensor* half = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
			
 
				+    half->data = malloc(ggml_nbytes(half));
			
 
				+    ggml_set_f32(half, 0.5f);
			
 
				+    ggml_tensor* half_like_seqs = ggml_repeat(ctx, half, seqs);
			
 
				+    seqs = ggml_mul(ctx, half_like_seqs, seqs);
			
 
				+    seqs = ggml_add(ctx, seqs, skip);
			
 
				+
			
 
				+    // Adaptor blocks, discovered the same way as the Conformer blocks.
			
 
				+    for (int adaptor_idx = 0; ; ++adaptor_idx) {
			
 
				+        std::string adaptor_name = prefix + ".adaptor_layers." + std::to_string(adaptor_idx);
			
 
				+        if (!has_layer(model, adaptor_name)) {
			
 
				+            break;
			
 
				+        }
			
 
				+        seqs = StandardConformerEncoderAdaptorLayer_forward(model, adaptor_name, seqs, padding_mask);
			
 
				+        ggml_set_name(seqs, ("x_ada_" + std::to_string(adaptor_idx)).c_str());
			
 
				+    }
			
 
				+
			
 
				+    return LayerNorm_forward(model, prefix + ".layer_norm", seqs);
			
 
				+}
			
 
				+
			
 
				+// One adaptor block of the speech encoder.
			
 
				+// The input goes through two parallel LayerNorm -> conv1d -> GLU branches
			
 
				+// (a residual branch and a self-attention branch); the attention output is
			
 
				+// added to the residual branch, then a LayerNorm + feed-forward with its own
			
 
				+// skip connection follows.
			
 
				+// `padding_mask` is accepted but never read; attention runs without a mask.
			
 
				+extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs,
			
 
				+    ggml_tensor* padding_mask
			
 
				+) {
			
 
				+    ggml_context* ctx = model.ctx;
			
 
				+
			
 
				+    // Shared by both branches: LayerNorm, swap the first two axes, 1-D conv
			
 
				+    // with stride 8 / padding 4 / dilation 1, swap back, add bias, GLU.
			
 
				+    auto norm_conv_glu = [&](const std::string& norm_name, const std::string& conv_name, ggml_tensor* x) -> ggml_tensor* {
			
 
				+        x = LayerNorm_forward(model, prefix + norm_name, x);
			
 
				+        x = ggml_dup(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
			
 
				+        x = ggml_conv_1d_generic(ctx, model.tensors[prefix + conv_name + ".weight"], x, 8, 4, 1);
			
 
				+        x = ggml_dup(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
			
 
				+        x = ggml_add(ctx, ggml_repeat(ctx, model.tensors[prefix + conv_name + ".bias"], x), x);
			
 
				+        return ggml_glu(ctx, x);
			
 
				+    };
			
 
				+
			
 
				+    // The residual branch is built first, then the attention branch.
			
 
				+    ggml_tensor* skip = norm_conv_glu(".residual_layer_norm", ".residual_conv", seqs);
			
 
				+    seqs = norm_conv_glu(".self_attn_layer_norm", ".self_attn_conv", seqs);
			
 
				+
			
 
				+    // Self-attention: queries, keys and values are all the pooled sequence.
			
 
				+    seqs = MultiheadAttention_forward(
			
 
				+        model,
			
 
				+        prefix + ".self_attn",
			
 
				+        seqs,
			
 
				+        seqs,
			
 
				+        seqs,
			
 
				+        /*attention masks=*/nullptr
			
 
				+    );
			
 
				+    seqs = ggml_add(ctx, seqs, skip);
			
 
				+
			
 
				+    // Feed-forward sub-layer with its own skip connection.
			
 
				+    skip = seqs;
			
 
				+    seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs);
			
 
				+    seqs = StandardFeedForwardNetwork_forward(model, prefix + ".ffn", seqs);
			
 
				+    return ggml_add(ctx, seqs, skip);
			
 
				+}
			
 
				+
			
 
				+
			
 
				 /// ggml_slice(X, -1, start, end) is equivalent to X[start:end]
			
 
				 /// ggml_slice(X, 0, start, end) is equivalent to X[..., start:end]
			
 
				 struct ggml_tensor * ggml_slice(
			
@@ -905,7 +1238,7 @@ extern "C" Hypothesis* generate_sequence(
 
				             ggml_graph_compute_with_ctx(ctx, &gf, 1);
			
 
				             ggml_detach(new_scores);
			
 
				         }
			
 
				-
			
 
				+        
			
 
				         // new_seqs[:, step_nr + 1] = next_tokens
			
 
				         // new_scores[:, step_nr + 1] = next_scores
			
 
				         for (std::size_t i = 0; i < beam_size; ++i) {
			
--- a/ggml/examples/unity/fairseq2.h
+++ b/ggml/examples/unity/fairseq2.h
@@ -4,6 +4,7 @@
 
				 #include <string>
			
 
				 #include <vector>
			
 
				 #include "ggml.h"
			
 
				+#include "kaldi-native-fbank/csrc/feature-fbank.h"
			
 
				 
			
 
				 
			
 
				 struct fairseq2_model {
			
@@ -27,6 +28,11 @@ extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_con
 
				 extern "C" std::string* std_string_alloc(char* c_str);
			
 
				 extern "C" void std_string_free(std::string* str);
			
 
				 
			
 
				+extern "C" ggml_tensor* WaveformToFbank_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string &prefix,
			
 
				+    ggml_tensor* waveform 
			
 
				+);
			
 
				 extern "C" ggml_tensor* ggml_slice(
			
 
				     struct ggml_context* ctx,
			
 
				     struct ggml_tensor* a,
			
@@ -64,6 +70,12 @@ extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
 
				     ggml_tensor* seqs
			
 
				 );
			
 
				 
			
 
				+extern "C" ggml_tensor* SiluFeedForwardNetwork_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs
			
 
				+);
			
 
				+
			
 
				 extern "C" ggml_tensor* MultiheadAttention_forward(
			
 
				     fairseq2_model& model,
			
 
				     const std::string &prefix,
			
@@ -93,6 +105,45 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
 
				     ggml_tensor* padding_mask
			
 
				 );
			
 
				 
			
 
				+extern "C" ggml_tensor* RelativePositionMHA_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs
			
 
				+);
			
 
				+
			
 
				+extern "C" ggml_tensor* ConvModule_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs
			
 
				+);
			
 
				+
			
 
				+extern "C" ggml_tensor* StandardConformerEncoderLayer_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs,
			
 
				+    ggml_tensor* padding_mask
			
 
				+);
			
 
				+
			
 
				+extern "C" ggml_tensor* StandardConformerEncoder_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs,
			
 
				+    ggml_tensor* padding_mask
			
 
				+);
			
 
				+
			
 
				+extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs,
			
 
				+    ggml_tensor* padding_mask
			
 
				+);
			
 
				+
			
 
				+extern "C" ggml_tensor* StandardConformerEncoderAdaptor_forward(
			
 
				+    fairseq2_model& model,
			
 
				+    const std::string& prefix,
			
 
				+    ggml_tensor* seqs,
			
 
				+    ggml_tensor* padding_mask
			
 
				+);
			
 
				 // Specifies the Layer Normalization order.
			
 
				 enum TransformerNormOrder {
			
 
				     TRANSFORMER_NORM_ORDER_POST = 0,
			
--- a/ggml/ggml_convert.py
+++ b/ggml/ggml_convert.py
@@ -11,6 +11,7 @@ from enum import Enum
 
				 from io import BufferedWriter
			
 
				 from pathlib import Path
			
 
				 from typing import Any, Callable, Dict, Optional, Tuple, Union
			
 
				+import math
			
 
				 import torch
			
 
				 import ggml
			
 
				 from typing import List
			
@@ -21,6 +22,49 @@ from seamless_communication.models.unity import load_unity_config, load_unity_mo
 
				 
			
 
				 Preprocessor = Callable[[Any], Any]
			
 
				 
			
 
				+def pos_enc(max_seq_len=4096, encoding_dim=1024):
			
 
				+    """Build the sinusoidal table used for relative positional encoding.
			
 
				+
			
 
				+    Returns a float32 tensor of shape ``(2 * max_seq_len - 1, encoding_dim)``.
			
 
				+    Rows are ordered ``[max, ..., 3, 2, 1, 0, -1, -2, -3, ..., min]``; even
			
 
				+    columns hold sines and odd columns hold cosines.
			
 
				+
			
 
				+    Adapted from
			
 
				+    https://github.com/facebookresearch/fairseq2/blob/main/src/fairseq2/nn/transformer/relative_attention.py#L22
			
 
				+    """
			
 
				+    table = torch.empty(
			
 
				+        ((max_seq_len * 2) - 1, encoding_dim), dtype=torch.float32
			
 
				+    )
			
 
				+
			
 
				+    # (1, E / 2): even channel indices turned into inverse frequencies.
			
 
				+    even_channels = torch.arange(
			
 
				+        0, encoding_dim, step=2, device=table.device, dtype=torch.float32
			
 
				+    ).unsqueeze(0)
			
 
				+    inv_freq = torch.exp(even_channels * -math.log(10000) / encoding_dim)
			
 
				+
			
 
				+    # (S, 1): absolute offsets 0 .. S - 1.
			
 
				+    offsets = torch.arange(
			
 
				+        max_seq_len, device=table.device, dtype=torch.float32
			
 
				+    ).unsqueeze(1)
			
 
				+
			
 
				+    # (S, 1) x (1, E / 2) -> (S, E / 2)
			
 
				+    angles = torch.matmul(offsets, inv_freq)
			
 
				+    descending_angles = angles.flip([0])
			
 
				+
			
 
				+    # Top half: offsets S - 1 down to 0.
			
 
				+    table[:max_seq_len, 0::2] = torch.sin(descending_angles)
			
 
				+    table[:max_seq_len, 1::2] = torch.cos(descending_angles)
			
 
				+
			
 
				+    # Bottom half: offsets -1 down to -(S - 1).
			
 
				+    table[max_seq_len:, 0::2] = torch.sin(-1 * angles[1:])
			
 
				+    table[max_seq_len:, 1::2] = torch.cos(-1 * angles[1:])
			
 
				+
			
 
				+    return table
			
 
				 
			
 
				 def convert_model(model_name: str, out: Optional[Path] = None) -> None:
			
 
				     if out is None:
			
@@ -87,6 +131,7 @@ def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) ->
 
				         assert name not in state_dict
			
 
				         state_dict[name] = pos_encoder.weight
			
 
				 
			
 
				+    state_dict["speech_encoder.pos_enc"] = pos_enc()
			
 
				 
			
 
				 def write_ggml_file(
			
 
				     out: BufferedWriter, hparams: Dict[str, Any], state_dict: Dict[str, torch.Tensor]
			
@@ -142,9 +187,13 @@ def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -
 
				     """
			
 
				     for key, value in state_dict.items():
			
 
				         write_string(out, key)
			
 
				-        if key.endswith(".bias") and value.ndim == 1:
			
 
				+        if key.endswith(".bias") and value.ndim == 1 and "adaptor" not in key:
			
 
				             # GGML broadcasting isn't as strong as numpy
			
 
				             value = value.reshape(1, -1)
			
 
				+        if "pointwise_conv" in key: # pointwise_conv / depthwise_conv
			
 
				+            value = value.squeeze(-1)
			
 
				+        if "depthwise_conv" in key:
			
 
				+            value = value.squeeze(1)
			
 
				         write_tensor(out, value.contiguous())
			
 
				 
			
 
				 
			
--- a/ggml/include/ggml/ggml.h
+++ b/ggml/include/ggml/ggml.h
@@ -347,7 +347,7 @@ extern "C" {
 
				         GGML_OP_NONE = 0,
			
 
				 
			
 
				         GGML_OP_DUP,
			
 
				-        GGML_OP_ADD, //2
			
 
				+        GGML_OP_ADD,
			
 
				         GGML_OP_ADD1,
			
 
				         GGML_OP_ACC,
			
 
				         GGML_OP_SUB,
			
@@ -367,19 +367,21 @@ extern "C" {
 
				         GGML_OP_GET_FIRST_COLS_BY_ROWS,
			
 
				         GGML_OP_SILU_BACK,
			
 
				         GGML_OP_NORM, // normalize
			
 
				+        GGML_OP_BATCH_NORM, 
			
 
				         GGML_OP_RMS_NORM,
			
 
				         GGML_OP_RMS_NORM_BACK,
			
 
				         GGML_OP_GROUP_NORM,
			
 
				 
			
 
				-        GGML_OP_MUL_MAT, //23
			
 
				+        GGML_OP_MUL_MAT,
			
 
				         GGML_OP_OUT_PROD,
			
 
				+
			
 
				         GGML_OP_SCALE,
			
 
				         GGML_OP_SET,
			
 
				         GGML_OP_CPY,
			
 
				         GGML_OP_CONT,
			
 
				-        GGML_OP_RESHAPE, //29
			
 
				+        GGML_OP_RESHAPE,
			
 
				         GGML_OP_VIEW,
			
 
				-        GGML_OP_PERMUTE, //32
			
 
				+        GGML_OP_PERMUTE,
			
 
				         GGML_OP_TRANSPOSE,
			
 
				         GGML_OP_GET_ROWS,
			
 
				         GGML_OP_GET_ROWS_BACK,
			
@@ -393,11 +395,19 @@ extern "C" {
 
				         GGML_OP_ALIBI,
			
 
				         GGML_OP_CLAMP,
			
 
				         GGML_OP_CONV_1D,
			
 
				+        GGML_OP_CONV_1D_GENERIC,
			
 
				         GGML_OP_CONV_2D,
			
 
				         GGML_OP_CONV_TRANSPOSE_2D,
			
 
				         GGML_OP_POOL_1D,
			
 
				         GGML_OP_POOL_2D,
			
 
				 
			
 
				+        GGML_OP_CONV_1D_STAGE_0,  // internal
			
 
				+        GGML_OP_CONV_1D_STAGE_1,  // internal
			
 
				+        GGML_OP_CONV_1D_STAGE_2,  // internal
			
 
				+
			
 
				+        GGML_OP_CONV_1D_GENERIC_STAGE_0,
			
 
				+        GGML_OP_CONV_1D_GENERIC_STAGE_1,  
			
 
				+
			
 
				         GGML_OP_UPSCALE, // nearest interpolate
			
 
				 
			
 
				         GGML_OP_FLASH_ATTN,
			
@@ -438,6 +448,7 @@ extern "C" {
 
				         GGML_UNARY_OP_GELU,
			
 
				         GGML_UNARY_OP_GELU_QUICK,
			
 
				         GGML_UNARY_OP_SILU,
			
 
				+        GGML_UNARY_OP_GLU,
			
 
				     };
			
 
				 
			
 
				     enum ggml_object_type {
			
@@ -937,6 +948,10 @@ extern "C" {
 
				             struct ggml_tensor  * a,
			
 
				             struct ggml_tensor  * b);
			
 
				 
			
 
				+    GGML_API struct ggml_tensor * ggml_glu(
			
 
				+            struct ggml_context * ctx,
			
 
				+            struct ggml_tensor  * a);
			
 
				+
			
 
				     // normalize along rows
			
 
				     GGML_API struct ggml_tensor * ggml_norm(
			
 
				             struct ggml_context * ctx,
			
@@ -948,6 +963,15 @@ extern "C" {
 
				             struct ggml_tensor  * a,
			
 
				             float                 eps);
			
 
				 
			
 
				+    GGML_API struct ggml_tensor * ggml_batch_norm(
			
 
				+            struct ggml_context * ctx,
			
 
				+            struct ggml_tensor  * a,
			
 
				+            struct ggml_tensor  * gamma,
			
 
				+            struct ggml_tensor  * beta,
			
 
				+            struct ggml_tensor  * running_mean,
			
 
				+            struct ggml_tensor  * running_var,
			
 
				+            float eps);
			
 
				+
			
 
				     GGML_API struct ggml_tensor * ggml_rms_norm(
			
 
				             struct ggml_context * ctx,
			
 
				             struct ggml_tensor  * a,
			
@@ -1319,6 +1343,14 @@ extern "C" {
 
				             int                   p0,  // padding
			
 
				             int                   d0); // dilation
			
 
				 
			
 
				+    GGML_API struct ggml_tensor * ggml_conv_1d_generic(
			
 
				+            struct ggml_context * ctx,
			
 
				+            struct ggml_tensor  * a,
			
 
				+            struct ggml_tensor  * b,
			
 
				+            int                   s0,  // stride
			
 
				+            int                   p0,  // padding
			
 
				+            int                   d0); // dilation
			
 
				+
			
 
				     // conv_1d with padding = half
			
 
				     // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
			
 
				     GGML_API struct ggml_tensor* ggml_conv_1d_ph(
			
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -271,9 +271,9 @@ target_include_directories(${TARGET} PUBLIC
 
				     )
			
 
				 
			
 
				 if (MSVC)
			
 
				-    target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
			
 
				+    target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank)
			
 
				 else()
			
 
				-    target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
			
 
				+    target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank)
			
 
				 endif()
			
 
				 
			
 
				 if (BUILD_SHARED_LIBS)
			
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
--- a/ggml/test_unity_cpp.py
+++ b/ggml/test_unity_cpp.py
@@ -18,7 +18,9 @@ from typing import Iterator
 
				 from ggml import NativeObj
			
 
				 from ggml_convert import convert_model
			
 
				 from seamless_communication.models.inference.translator import Translator, Modality
			
 
				-
			
 
				+from fairseq2.data.audio import WaveformToFbankConverter
			
 
				+import torchaudio
			
 
				+from fairseq2.models.wav2vec2.feature_extractor import Wav2Vec2FbankFeatureExtractor
			
 
				 Ctx = ggml.ggml_context_p
			
 
				 
			
 
				 UNITY_MODELS = Path(__file__).parent / "examples/unity/models"
			
@@ -253,6 +255,60 @@ def test_StandardTransformerEncoderLayer_forward(
 
				     assert y.shape == y_exp.shape
			
 
				     assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
			
 
				 
			
 
				+def test_StandardConformerEncoderLayer_forward(
			
 
				+    ctx: Ctx, g_model: c_void_p
			
 
				+) -> None:
			
 
				+    """Check the ggml StandardConformerEncoderLayer against the PyTorch layer.
			
 
				+
			
 
				+    Runs `speech_encoder.inner.layers.0` on a saved activation tensor and
			
 
				+    requires matching shapes and values within atol=2e-3.
			
 
				+    """
			
 
				+    pt_model = load_pt_model()
			
 
				+    # Resolve the fixture next to this test file instead of through a
			
 
				+    # machine-specific absolute path.
			
 
				+    x = torch.load(Path(__file__).parent / "examples/unity/dev/seqs_before_conformer_block.pt")
			
 
				+    padding_mask = torch.ones((1, x.shape[1]))
			
 
				+    layer = pt_model.speech_encoder.inner.layers[0]
			
 
				+    gx = ggml.from_numpy(ctx, x[0])
			
 
				+    ggml.ggml_set_name(gx, b"x")
			
 
				+    # The mask tensor is created and named, but the ggml call below still
			
 
				+    # receives None.
			
 
				+    gpad = ggml.from_numpy(ctx, padding_mask[0])
			
 
				+    ggml.ggml_set_name(gpad, b"padding_mask")
			
 
				+    gy = ggml.forward(
			
 
				+        "StandardConformerEncoderLayer",
			
 
				+        g_model,
			
 
				+        "speech_encoder.inner.layers.0",
			
 
				+        gx,
			
 
				+        None,  # TODO support padding mask
			
 
				+    )
			
 
				+    gf = ggml.ggml_build_forward(gy)
			
 
				+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
			
 
				+
			
 
				+    y = ggml.to_numpy(gy)
			
 
				+
			
 
				+    y_exp, _ = layer(x, padding_mask)
			
 
				+    y_exp = y_exp.numpy()
			
 
				+    assert y.shape == y_exp.shape
			
 
				+    assert np.allclose(y_exp, y, atol=2e-3)
			
 
				+
			
 
				+def test_StandardConformerEncoderAdaptorLayer_forward(
			
 
				+    ctx: Ctx, g_model: c_void_p
			
 
				+) -> None:
			
 
				+    """Check the ggml StandardConformerEncoderAdaptorLayer against PyTorch.
			
 
				+
			
 
				+    Runs `speech_encoder.adaptor_layers.0` on a saved activation tensor and
			
 
				+    requires matching shapes and values within atol=2e-3.
			
 
				+    """
			
 
				+    pt_model = load_pt_model()
			
 
				+    # Resolve the fixture next to this test file instead of through a
			
 
				+    # machine-specific absolute path.
			
 
				+    x = torch.load(Path(__file__).parent / "examples/unity/dev/seqs_before_adaptor.pt")
			
 
				+    layer = pt_model.speech_encoder.adaptor_layers[0]
			
 
				+    gx = ggml.from_numpy(ctx, x[0])
			
 
				+    ggml.ggml_set_name(gx, b"x")
			
 
				+    gy = ggml.forward(
			
 
				+        "StandardConformerEncoderAdaptorLayer",
			
 
				+        g_model,
			
 
				+        "speech_encoder.adaptor_layers.0",
			
 
				+        gx,
			
 
				+        None,  # TODO support padding mask
			
 
				+    )
			
 
				+    gf = ggml.ggml_build_forward(gy)
			
 
				+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
			
 
				+
			
 
				+    y = ggml.to_numpy(gy)
			
 
				+
			
 
				+    y_exp, _ = layer(x, None)
			
 
				+    y_exp = y_exp.numpy()
			
 
				+
			
 
				+    assert y.shape == y_exp.shape
			
 
				+    assert np.allclose(y_exp, y, atol=2e-3)
			
 
				+
			
 
				 
			
 
				 def test_StandardTransformerEncoder_forward(
			
 
				     ctx: Ctx, g_model: c_void_p
			
@@ -283,7 +339,97 @@ def test_StandardTransformerEncoder_forward(
 
				     y_exp = y_exp.numpy()
			
 
				 
			
 
				     assert y.shape == y_exp.shape
			
 
				-    assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
			
 
				+    assert np.allclose(y_exp, y, atol=1e-4)
			
 
				+
			
 
				+def test_StandardConformerEncoder_forward(
			
 
				+    ctx: Ctx, g_model: c_void_p
			
 
				+) -> None:
			
 
				+    """Check the full ggml speech encoder against the PyTorch speech encoder.
			
 
				+
			
 
				+    The ggml side receives the raw waveform scaled by 2**15; the PyTorch side
			
 
				+    is fed through WaveformToFbankConverter and the speech encoder frontend.
			
 
				+    Shapes must match and values must agree within atol=1e-2.
			
 
				+    """
			
 
				+    pt_model = load_pt_model()
			
 
				+    # Resolve the audio fixture next to this test file instead of through a
			
 
				+    # machine-specific absolute path.
			
 
				+    wav_path = Path(__file__).parent / "examples/unity/test.wav"
			
 
				+    wav, _ = torchaudio.load(str(wav_path))
			
 
				+    gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml!
			
 
				+    ggml.ggml_set_name(gx, b"x")
			
 
				+    gy = ggml.forward(
			
 
				+        "StandardConformerEncoder",
			
 
				+        g_model,
			
 
				+        "speech_encoder",
			
 
				+        gx,
			
 
				+        None,  # TODO support padding mask
			
 
				+    )
			
 
				+    gf = ggml.ggml_build_forward(gy)
			
 
				+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
			
 
				+
			
 
				+    converter = WaveformToFbankConverter(
			
 
				+        num_mel_bins=80,
			
 
				+        waveform_scale=2**15,
			
 
				+        channel_last=True,
			
 
				+        standardize=True,
			
 
				+    )
			
 
				+    converter_input = {
			
 
				+        "waveform": wav.transpose(0, 1),
			
 
				+        "sample_rate": 16000.,
			
 
				+        "format": -1,
			
 
				+    }
			
 
				+
			
 
				+    y = ggml.to_numpy(gy)
			
 
				+    speech_encoder_input = pt_model.speech_encoder_frontend(converter(converter_input)["fbank"].unsqueeze(0), None)[0]
			
 
				+
			
 
				+    y_exp, _ = pt_model.speech_encoder(speech_encoder_input, None)
			
 
				+    y_exp = y_exp.numpy()
			
 
				+
			
 
				+    assert y.shape == y_exp.shape
			
 
				+    assert np.allclose(y_exp, y, atol=1e-2) # There are 10 elements in a 137*1024 tensor with error >1e-2
			
 
				+
			
 
				+    
			
 
				+
			
 
				+def test_WaveformToFbank_forward(
			
 
				+    ctx: Ctx, g_model: c_void_p
			
 
				+) -> None:
			
 
				+    """Check the ggml WaveformToFbank op against the fairseq2 reference.
			
 
				+
			
 
				+    The reference is WaveformToFbankConverter followed by
			
 
				+    Wav2Vec2FbankFeatureExtractor(80, 2, 1); values must agree within
			
 
				+    atol=4e-3.
			
 
				+    """
			
 
				+    pt_model = load_pt_model()
			
 
				+    converter = WaveformToFbankConverter(
			
 
				+        num_mel_bins=80,
			
 
				+        waveform_scale=2**15,
			
 
				+        channel_last=True,
			
 
				+        standardize=True,
			
 
				+    )
			
 
				+    extractor = Wav2Vec2FbankFeatureExtractor(80, 2, 1)
			
 
				+    # Resolve the audio fixture next to this test file instead of through a
			
 
				+    # machine-specific absolute path.
			
 
				+    wav_path = Path(__file__).parent / "examples/unity/test.wav"
			
 
				+    wav, _ = torchaudio.load(str(wav_path))
			
 
				+    gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml!
			
 
				+    ggml.ggml_set_name(gx, b"x")
			
 
				+
			
 
				+    gy = ggml.forward(
			
 
				+        "WaveformToFbank",
			
 
				+        g_model,
			
 
				+        "",
			
 
				+        gx
			
 
				+    )
			
 
				+    gf = ggml.ggml_build_forward(gy)
			
 
				+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
			
 
				+
			
 
				+    y = ggml.to_numpy(gy)
			
 
				+    converter_input = {
			
 
				+        "waveform": wav.transpose(0, 1),
			
 
				+        "sample_rate": 16000.,
			
 
				+        "format": -1,
			
 
				+    }
			
 
				+    y_exp = extractor(converter(converter_input)["fbank"].unsqueeze(0), None)[0]
			
 
				+    y_exp = y_exp.numpy()
			
 
				+
			
 
				+    assert y.shape == y_exp.shape
			
 
				+    assert np.allclose(y_exp, y, atol=4e-3) # reduce? error is from standardization
			
 
				+
			
 
				+
			
 
				+def test_causal_attention_mask(ctx: Ctx):
			
 
				+    """The ggml causal mask for a (5, 10) input must equal fairseq2's (10, 10) mask."""
			
 
				+    seqs = torch.zeros((5, 10))
			
 
				+    expected = fairseq2.nn.transformer.CausalAttentionMaskGenerator()(seqs)
			
 
				+
			
 
				+    g_seqs = ggml.from_numpy(ctx, seqs)
			
 
				+    actual = ggml.to_numpy(ggml.causal_attention_mask(ctx, g_seqs))
			
 
				+
			
 
				+    assert expected.shape == (10, 10)
			
 
				+    assert actual.shape == (10, 10)
			
 
				+    assert np.allclose(actual, expected)
			
 
				 
			
 
				 
			
 
				 def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
			
@@ -443,3 +589,71 @@ def test_t2tt(ctx: Ctx, g_model: c_void_p):
 
				         # The score error is big, this may negatively impact the beam search.
			
 
				         assert np.allclose(g_step_scores, exp["step_scores"], atol=0.1)
			
 
				 
			
 
				+def test_s2tt(ctx: Ctx, g_model: c_void_p):
			
 
				+    """Speech-to-text translation: ggml encoder plus beam search on test.wav.
			
 
				+
			
 
				+    The expected tokens and score below are hard-coded reference values. They
			
 
				+    were taken from the PyTorch `Translator.get_prediction` run on the fbank
			
 
				+    features of the same audio (input_modality=Modality.SPEECH,
			
 
				+    output_modality=Modality.TEXT, tgt_lang="cmn"), whose decoded sentence
			
 
				+    was "大家好 , 世界无主题。".
			
 
				+    """
			
 
				+    # Resolve the audio fixture next to this test file instead of through a
			
 
				+    # machine-specific absolute path.
			
 
				+    wav_path = Path(__file__).parent / "examples/unity/test.wav"
			
 
				+    src_audio_wav, _ = torchaudio.load(str(wav_path))
			
 
				+
			
 
				+    tgt_tokens = [     3, 256200,  16991, 249346, 249725,    146,  25220, 251069, 249211,
			
 
				+        251148, 253935,      3]
			
 
				+    score = -1.606838583946228
			
 
				+    gx = ggml.from_numpy(ctx, src_audio_wav * 2**15) # Apply scale before sending into ggml!
			
 
				+    ggml.ggml_set_name(gx, b"x")
			
 
				+    gy = ggml.forward(
			
 
				+        "StandardConformerEncoder",
			
 
				+        g_model,
			
 
				+        "speech_encoder",
			
 
				+        gx,
			
 
				+        None,  # TODO support padding mask
			
 
				+    )
			
 
				+    gf = ggml.ggml_build_forward(gy)
			
 
				+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
			
 
				+
			
 
				+    encoder_out = gy
			
 
				+
			
 
				+    job = ggml.SequenceGeneratorJob()
			
 
				+    job.opts.beam_size = 1
			
 
				+    job.opts.min_seq_len = 1
			
 
				+    job.opts.soft_max_seq_len_a = 1
			
 
				+    job.opts.soft_max_seq_len_b = 200
			
 
				+    job.opts.hard_max_seq_len = 20
			
 
				+    job.opts.len_penalty = 1.0
			
 
				+    job.opts.unk_penalty = 0.0
			
 
				+    # Same two leading tokens as the reference sequence above.
			
 
				+    job.prefix_seq = ggml.from_numpy(ctx, np.array([3, 256200]).astype(np.int32))
			
 
				+    job.opts.normalize_scores = True
			
 
				+    job.pad_idx = 0
			
 
				+    job.unk_idx = 1
			
 
				+    job.bos_idx = 2
			
 
				+    job.eos_idx = 3
			
 
				+
			
 
				+    result = ggml.ggml_tensor()
			
 
				+
			
 
				+    g_score = ggml.generate_sequence(
			
 
				+        g_model, job, encoder_out, None, ctypes.byref(result)
			
 
				+    )
			
 
				+    tokens = list(ggml.to_numpy(result))
			
 
				+    assert tokens == tgt_tokens
			
 
				+    assert g_score == pytest.approx(score)
			
--- a/ggml/third_party_ggml.py
+++ b/ggml/third_party_ggml.py
@@ -60,7 +60,6 @@ from typing import Type
 
				 from typing import Callable
			
 
				 from typing import Tuple
			
 
				 from typing import Dict
			
 
				-from typing_extensions import Self
			
 
				 from typing import Any
			
 
				 from pathlib import Path
			
 
				 from typing import List, Optional, Sequence, Union