unity.cpp 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  #include "ggml/ggml.h"
  #include "ggml/ggml-alloc.h"
  #include "math.h"
  #include "model_loader.h"
  #include "fairseq2.h"
  #include "ggml-alloc.h"

  #include <sndfile.h>

  #include <algorithm>
  #include <cassert>
  #include <cmath>
  #include <cstdint>
  #include <cstdio>
  #include <cstdlib>
  #include <cstring>
  #include <fstream>
  #include <iostream>
  #include <map>
  #include <memory>
  #include <sstream>
  #include <stdexcept>
  #include <string>
  #include <thread>
  #include <vector>
  19. struct unity_params {
  20. int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
  21. std::string model = "seamlessM4T_medium.ggml"; // model path
  22. std::string tgt_lang = "eng";
  23. std::vector<std::string> files = {};
  24. bool text = false;
  25. SequenceGeneratorOptions opts = {
  26. /*beam_size*/ 5,
  27. /*min_seq_len*/ 1,
  28. /*soft_max_seq_len_a*/ 1,
  29. /*soft_max_seq_len_b*/ 200,
  30. /*hard_max_seq_len*/ 1000,
  31. /*len_penalty*/ 1.0,
  32. /*unk_penalty*/ 0.0,
  33. /*normalize_scores*/ true,
  34. /*mem_mb*/ 512,
  35. };
  36. };
  37. void unity_print_usage(int /*argc*/, char ** argv, const unity_params & params) {
  38. fprintf(stderr, "usage: %s [options] file1 file2 ...\n", argv[0]);
  39. fprintf(stderr, "\n");
  40. fprintf(stderr, "options:\n");
  41. fprintf(stderr, " -h, --help show this help message and exit\n");
  42. fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
  43. fprintf(stderr, " -m FNAME, --model FNAME\n");
  44. fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
  45. fprintf(stderr, " --text text output\n");
  46. fprintf(stderr, " --beam-size beam size (default: %d)\n", params.opts.beam_size);
  47. fprintf(stderr, " -M, --mem memory buffer, increase for long inputs (default: %d)\n", params.opts.mem_mb);
  48. fprintf(stderr, "\n");
  49. }
  50. std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, unity_params& params) {
  51. if (i + 1 < argc && argv[i + 1][0] != '-') {
  52. return argv[++i];
  53. } else {
  54. fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
  55. unity_print_usage(argc, argv, params);
  56. exit(0);
  57. }
  58. }
  59. bool unity_params_parse(int argc, char ** argv, unity_params & params) {
  60. for (int i = 1; i < argc; i++) {
  61. std::string arg = argv[i];
  62. if (arg == "-h" || arg == "--help") {
  63. unity_print_usage(argc, argv, params);
  64. } else if (arg == "-t" || arg == "--threads") {
  65. params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
  66. } else if (arg == "-m" || arg == "--model") {
  67. params.model = get_next_arg(i, argc, argv, arg, params);
  68. } else if (arg == "-l" || arg == "--tgt-lang") {
  69. params.tgt_lang = get_next_arg(i, argc, argv, arg, params);
  70. } else if (arg == "--text") {
  71. params.text = true;
  72. } else if (arg == "-b" || arg == "--beam-size") {
  73. params.opts.beam_size = std::stoi(get_next_arg(i, argc, argv, arg, params));
  74. } else if (arg == "-M" || arg == "--mem") {
  75. params.opts.mem_mb = std::stoi(get_next_arg(i, argc, argv, arg, params));
  76. } else {
  77. params.files.push_back(std::string(arg));
  78. }
  79. }
  80. return true;
  81. }
  82. struct ggml_cgraph * unity_speech_encoder(
  83. fairseq2_model& model,
  84. struct ggml_tensor * speech_input) {
  85. ggml_context* ctx0 = model.ctx;
  86. ggml_cgraph* gf = ggml_new_graph(ctx0);
  87. ggml_tensor* seqs = StandardConformerEncoder_forward(model, "speech_encoder", speech_input, nullptr);
  88. seqs = ggml_dup(model.ctx, seqs);
  89. ggml_build_forward_expand(gf, seqs);
  90. return gf;
  91. }
  92. Hypothesis* unity_decode(
  93. fairseq2_model& model,
  94. const SequenceGeneratorOptions& opts,
  95. int tgt_lang_idx,
  96. ggml_tensor* encoder_output,
  97. int n_threads
  98. ) {
  99. SequenceGeneratorJob job = {
  100. opts,
  101. /*prefix_seq*/ nullptr,
  102. /*pad_idx*/model.vocab.token_to_id["<pad>"],
  103. /*unk_idx*/model.vocab.token_to_id["<unk>"],
  104. /*bos_idx*/model.vocab.token_to_id["<s>"],
  105. /*eos_idx*/model.vocab.token_to_id["</s>"],
  106. /*num_threads*/n_threads,
  107. };
  108. FORCE_ALLOC(prefix_seq, model.ctx, ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 2));
  109. ((int *)prefix_seq->data)[0] = job.eos_idx;
  110. ((int *)prefix_seq->data)[1] = tgt_lang_idx;
  111. job.prefix_seq = prefix_seq;
  112. return generate_sequence(model, job, encoder_output, nullptr, model.ctx, n_threads);
  113. }
  114. int main(int argc, char ** argv) {
  115. unity_params params;
  116. if (unity_params_parse(argc, argv, params) == false) {
  117. return 1;
  118. }
  119. fairseq2_model model;
  120. // load the model
  121. if (load_fairseq2_ggml_file(model, params.model.c_str())) {
  122. fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
  123. return 1;
  124. }
  125. // The ctx_size_mb mostly depends of input length and model dim.
  126. int ctx_size_mb = params.opts.mem_mb;
  127. auto encoder_buf = std::vector<uint8_t>(8 * 1024 * 1024); // Only tensor metadata goes in there
  128. auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024 / 2);
  129. ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
  130. char result_str[4096];
  131. std::string input;
  132. bool interactive = params.files.size() == 0;
  133. auto next_file = params.files.begin();
  134. while (true) {
  135. if (interactive) {
  136. std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n";
  137. std::getline(std::cin, input);
  138. if (input == "exit") {
  139. break;
  140. }
  141. } else {
  142. if (next_file == params.files.end()) break;
  143. input = *(next_file++);
  144. }
  145. std::istringstream iss(input);
  146. std::string audio_path;
  147. std::string tgt_lang = params.tgt_lang;
  148. iss >> audio_path >> tgt_lang;
  149. if (audio_path == "-") {
  150. audio_path = "/proc/self/fd/0";
  151. }
  152. std::cerr << "Translating (Transcribing) " << audio_path << " to " << tgt_lang << "\n";
  153. SF_INFO info;
  154. SNDFILE* sndfile = sf_open(audio_path.c_str(), SFM_READ, &info);
  155. if (!sndfile) {
  156. std::cerr << "Could not open file\n";
  157. if (interactive) continue;
  158. else return 1;
  159. }
  160. auto tgt_lang_ptr = model.vocab.token_to_id.find("__" + tgt_lang + "__");
  161. if (tgt_lang_ptr == model.vocab.token_to_id.end()) {
  162. std::cerr << "Unknown language " << tgt_lang << "\n";
  163. if (interactive) continue;
  164. else return 2;
  165. }
  166. int tgt_lang_idx = tgt_lang_ptr->second;
  167. // Reset the ggml_context
  168. model.ctx = ctx_from_buffer(encoder_buf);
  169. ggml_set_no_alloc(model.ctx, true);
  170. GGML_ASSERT(info.samplerate == 16000);
  171. GGML_ASSERT(info.channels == 1);
  172. // stop at 30s. Ideally we should chunk input audio, but this will prevent most obvious OOM.
  173. int n_frames = std::min(info.samplerate * 30, (int)info.frames);
  174. ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_frames, info.channels);
  175. ggml_allocr_alloc(fwd_alloc, seqs);
  176. // Load audio input
  177. sf_readf_float(sndfile, (float*)seqs->data, n_frames);
  178. // Audio encoder
  179. ggml_cgraph* gf = unity_speech_encoder(model, seqs);
  180. size_t enc_mem_used = ggml_allocr_alloc_graph(fwd_alloc, gf);
  181. ggml_graph_compute_with_ctx(model.ctx, gf, params.n_threads);
  182. // encoder_output is valid until we call `ggml_allocr_reset(fwd_alloc)`
  183. ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
  184. // Beam search decoding
  185. const Hypothesis* result = unity_decode(model, params.opts, tgt_lang_idx, encoder_output, params.n_threads);
  186. // Drop language and bos token.
  187. ggml_tensor* tokens = ggml_slice(model.ctx, result[0].seq, 0, 2, 0);
  188. // Collect result string
  189. int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str);
  190. std::cout << std::string((char*)&result_str, n) << std::endl;
  191. ggml_free(model.ctx);
  192. ggml_allocr_reset(fwd_alloc);
  193. }
  194. return 0;
  195. }