unity.cpp 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. #include "ggml/ggml.h"
  2. #include "ggml/ggml-alloc.h"
  3. #include "math.h"
  4. #include "model_loader.h"
  5. #include "fairseq2.h"
  6. #include "lib/unity_lib.h"
  7. #include <sndfile.h>
  8. #include <cstdlib>
  9. #include "ggml-alloc.h"
  10. #include <numeric>
  11. #include <algorithm>
  12. struct unity_params {
  13. int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
  14. std::string model = "seamlessM4T_medium.ggml"; // model path
  15. std::string tgt_lang = "eng";
  16. std::vector<std::string> files = {};
  17. bool text = false;
  18. SequenceGeneratorOptions opts = {
  19. /*beam_size*/ 5,
  20. /*min_seq_len*/ 1,
  21. /*soft_max_seq_len_a*/ 1,
  22. /*soft_max_seq_len_b*/ 200,
  23. /*hard_max_seq_len*/ 1000,
  24. /*len_penalty*/ 1.0,
  25. /*unk_penalty*/ 0.0,
  26. /*normalize_scores*/ true,
  27. };
  28. bool verbose = false;
  29. };
  30. void unity_print_usage(int /*argc*/, char ** argv, const unity_params & params) {
  31. fprintf(stderr, "usage: %s [options] file1 file2 ...\n", argv[0]);
  32. fprintf(stderr, "\n");
  33. fprintf(stderr, "options:\n");
  34. fprintf(stderr, " -h, --help show this help message and exit\n");
  35. fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
  36. fprintf(stderr, " -v, --verbose Print out word level confidence score and LID score", params.verbose);
  37. fprintf(stderr, " -m FNAME, --model FNAME\n");
  38. fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
  39. fprintf(stderr, " --text text output\n");
  40. fprintf(stderr, " --beam-size beam size (default: %d)\n", params.opts.beam_size);
  41. fprintf(stderr, "\n");
  42. }
  43. std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, unity_params& params) {
  44. if (i + 1 < argc && argv[i + 1][0] != '-') {
  45. return argv[++i];
  46. } else {
  47. fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
  48. unity_print_usage(argc, argv, params);
  49. exit(0);
  50. }
  51. }
  52. bool unity_params_parse(int argc, char ** argv, unity_params & params) {
  53. for (int i = 1; i < argc; i++) {
  54. std::string arg = argv[i];
  55. if (arg == "-h" || arg == "--help") {
  56. unity_print_usage(argc, argv, params);
  57. } else if (arg == "-t" || arg == "--threads") {
  58. params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
  59. } else if (arg == "-m" || arg == "--model") {
  60. params.model = get_next_arg(i, argc, argv, arg, params);
  61. } else if (arg == "-l" || arg == "--tgt-lang") {
  62. params.tgt_lang = get_next_arg(i, argc, argv, arg, params);
  63. } else if (arg == "--text") {
  64. params.text = true;
  65. } else if (arg == "-b" || arg == "--beam-size") {
  66. params.opts.beam_size = std::stoi(get_next_arg(i, argc, argv, arg, params));
  67. } else if (arg == "-v" || arg == "--verbose") {
  68. params.verbose = true;
  69. }else {
  70. params.files.push_back(std::string(arg));
  71. }
  72. }
  73. return true;
  74. }
  75. int main(int argc, char ** argv) {
  76. unity_params params;
  77. if (unity_params_parse(argc, argv, params) == false) {
  78. return 1;
  79. }
  80. fairseq2_model model;
  81. // load the model
  82. if (load_fairseq2_ggml_file(model, params.model.c_str())) {
  83. fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
  84. return 1;
  85. }
  86. std::string input;
  87. bool interactive = params.files.size() == 0;
  88. auto next_file = params.files.begin();
  89. while (true) {
  90. if (interactive) {
  91. std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n";
  92. std::getline(std::cin, input);
  93. if (input == "exit") {
  94. break;
  95. }
  96. } else {
  97. if (next_file == params.files.end()) break;
  98. input = *(next_file++);
  99. }
  100. std::istringstream iss(input);
  101. std::string audio_path;
  102. std::string tgt_lang = params.tgt_lang;
  103. iss >> audio_path >> tgt_lang;
  104. if (audio_path == "-") {
  105. audio_path = "/proc/self/fd/0";
  106. }
  107. std::cerr << "Translating (Transcribing) " << audio_path << " to " << tgt_lang << "\n";
  108. SF_INFO info;
  109. SNDFILE* sndfile = sf_open(audio_path.c_str(), SFM_READ, &info);
  110. if (!sndfile) {
  111. std::cerr << "Could not open file\n";
  112. if (interactive) continue;
  113. else return 1;
  114. }
  115. // Load audio input
  116. std::vector<float> data(info.frames * info.channels); // Assume info.channels is always 1
  117. sf_readf_float(sndfile, data.data(), info.frames);
  118. // Reset the ggml_context
  119. // The ctx_size_mb mostly depends of input length and model dim.
  120. int ctx_size_mb = 128;
  121. auto encoder_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
  122. auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
  123. ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
  124. char result_str[4096];
  125. model.ctx = ctx_from_buffer(encoder_buf);
  126. ggml_set_no_alloc(model.ctx, false);
  127. ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, info.frames, info.channels);
  128. ggml_set_no_alloc(model.ctx, true);
  129. Result result = unity_eval(model, data, params.opts, tgt_lang, params.n_threads, ctx_size_mb);
  130. std::string concat_transcription = std::accumulate(std::next(result.transcription.begin()), result.transcription.end(), result.transcription[0],
  131. [](const std::string& a, const std::string& b) {
  132. return a + " " + b;
  133. }
  134. );
  135. if (params.verbose) {
  136. std::cout << "Final transcription: " << concat_transcription << std::endl;
  137. std::cout << std::endl;
  138. std::cout << "Word level confidence score:" << std::endl;
  139. for (size_t i = 0; i < result.transcription.size(); ++i) {
  140. std::cout << "Word: " << result.transcription[i] << " | Score: " << result.word_confidence_scores[i] << std::endl;
  141. }
  142. std::cout << std::endl;
  143. std::cout << "LID scores: " << std::endl;
  144. for (const auto& kv : result.lid_scores) {
  145. std::cout << "Language: " << kv.first << "| Score: " << kv.second << std::endl;
  146. }
  147. } else {
  148. std::cout << concat_transcription << std::endl;
  149. }
  150. }
  151. return 0;
  152. }