@@ -11,6 +11,8 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include <numeric>
+
 
 ggml_tensor* ggml_detach(ggml_tensor* a) {
     a->op = GGML_OP_NONE;
     std::fill(a->src, a->src + GGML_MAX_SRC, nullptr);
@@ -1166,15 +1168,18 @@ ggml_tensor* ggml_expand_2d(ggml_context* ctx, ggml_tensor* x, int64_t ne0, int6
     return y;
 }
 
-extern "C" void _bootstrap_seqs_and_scores(
+void _bootstrap_seqs_and_scores(
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
     ggml_tensor* full_seqs,
     ggml_tensor* scores,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
-    int n_threads
+    ggml_tensor* lid_scores,
+    int n_threads,
+    const std::vector<int>& lang_ids
 ) {
+    // Fills `lid_scores` with the model's probability for each language token.
     int prefix_seq_len = job.prefix_seq->ne[0];
     int max_seq_len = scores->ne[0];
     int beam_size = scores->ne[1];
@@ -1210,12 +1215,32 @@ extern "C" void _bootstrap_seqs_and_scores(
     ggml_tensor* lprobs = ggml_log_softmax(ctx, ggml_slice(ctx, logits, 1, 0, 1));
 
     ggml_cgraph gf = ggml_build_forward(lprobs);
-    ggml_graph_compute_with_ctx(ctx, &gf, 1);
+    ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
+
+    // Reinterpret the token tensors as i32 so their entries can be read with ggml_get_i32_1d.
+    full_seqs->type = GGML_TYPE_I32;
+    job.prefix_seq->type = GGML_TYPE_I32;
+    // For LID: turn the log-probability of each language token back into a probability.
+    for (size_t i = 0; i < lang_ids.size(); ++i) {
+        ggml_set_f32_1d(lid_scores, i, std::exp(ggml_get_f32_1d(lprobs, lang_ids[i])));
+    }
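+    // Note: these are vocabulary-wide softmax probabilities sampled at the lang-token
+    // ids only, so they do not necessarily sum to 1 across the languages.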
 
     // Fetch scores of next steps from "lprobs"
     float p_score = 0;
     for (int i = 1; i < prefix_seq_len; ++i) {
-        int p = ggml_get_i32_1d(job.prefix_seq, i);
+        int p;
+        if (ggml_get_i32_1d(job.prefix_seq, i) == model.vocab.token_to_id["<unk>"]) {
+            // If tgt_lang is <unk>, use the most probable lang token predicted by the model.
+            float max_value = -std::numeric_limits<float>::infinity();
+            for (size_t j = 0; j < lang_ids.size(); j++) {
+                if (ggml_get_f32_1d(lprobs, lang_ids[j]) > max_value) {
+                    max_value = ggml_get_f32_1d(lprobs, lang_ids[j]);
+                    p = lang_ids[j];
+                }
+            }
+        } else {
+            p = ggml_get_i32_1d(job.prefix_seq, i);
+        }
+
         p_score += ggml_get_f32_1d(lprobs, i * vocab_size + p);
         for (int b = 0; b < beam_size; ++b) {
             // scores: (N, S)
@@ -1296,6 +1321,7 @@ void _finalize_hypothesis(
     float eos_score,
     ggml_tensor* seqs, // (beam_size, seq_len)
     ggml_tensor* scores, // (beam_size, seq_len)
+    ggml_tensor* lid_scores, // (num_lang_tokens) probability of each language token
     Hypothesis* hypothesis
 ) {
     ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
@@ -1323,6 +1349,7 @@ void _finalize_hypothesis(
     // Skip first EOS since it is always 0 and skews normalization.
     eos_score /= (float)std::pow((step_nr + 1), job.opts.len_penalty);
     hypothesis->score = eos_score;
+    hypothesis->lid_scores = lid_scores;
 }
 
 // Uses ggml_context to store any object.
@@ -1346,6 +1373,7 @@ ggml_allocr* new_arena_allocr(std::vector<uint8_t>& buffer) {
 
 /// Generates a translation for a single sequence
 /// The results Hypothesis are written inside `result_ctx`.
+/// If <unk> is passed as the lang_tok, the sequence generator first predicts a lang_tok and uses it for the subsequent decoding.
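+/// The per-language probabilities are attached to each returned Hypothesis as `lid_scores`
+/// (one entry per lang token, in ascending token-id order).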
 extern "C" Hypothesis* generate_sequence(
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
@@ -1371,6 +1399,14 @@ extern "C" Hypothesis* generate_sequence(
     };
     ggml_allocr* step_alloc = new_arena_allocr(local_bufs[3]);
 
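+    // Collect the ids of all language tokens. Assumption: lang tokens are spelled
+    // "__xxx__" (e.g. "__eng__"), so any vocab entry wrapped in double underscores counts.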
+    std::vector<int> lang_ids;
+    for (const auto& kv : model.vocab.token_to_id) {
+        const std::string& token = kv.first;
+        if (token.size() >= 4 && token.compare(0, 2, "__") == 0 && token.compare(token.size() - 2, 2, "__") == 0) {
+            lang_ids.push_back(kv.second);
+        }
+    }
+    std::sort(lang_ids.begin(), lang_ids.end());
+
     ggml_tensor* embed = model.tensors["text_decoder_frontend.embed.weight"];
     size_t vocab_size = embed->ne[1];
     std::size_t beam_size = job.opts.beam_size;
@@ -1400,10 +1436,8 @@ extern "C" Hypothesis* generate_sequence(
     ggml_tensor* scores = ggml_new_tensor_2d(search_ctx, GGML_TYPE_F32, max_seq_len, beam_size);
     ggml_set_name(scores, "scores_0");
     ggml_set_f32(scores, 0.0);
-
     int prefix_seq_len = job.prefix_seq->ne[0];
     int start_step = prefix_seq_len - 1;
-
     ggml_context* prev_step_ctx = ctx_from_buffer(local_bufs[(start_step - 1) % 2]);
     ggml_context* step_ctx = ctx_from_buffer(local_bufs[start_step % 2]);
     GGML_ASSERT(step_ctx != search_ctx);
@@ -1411,8 +1445,9 @@ extern "C" Hypothesis* generate_sequence(
     model.ctx = prev_step_ctx;
     // search_ctx because we need encoder_decoder_attn.k_cache to survive for the full search
     model.kv_cache_ctx = search_ctx;
+    ggml_tensor* lid_scores = ggml_new_tensor_1d(result_ctx, GGML_TYPE_F32, lang_ids.size());
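+    // Allocated in result_ctx (not a per-step context) so the tensor is still alive
+    // in the hypotheses returned to the caller.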
     _bootstrap_seqs_and_scores(
-        model, job, seqs, scores, encoder_output, encoder_padding_mask, n_threads
+        model, job, seqs, scores, encoder_output, encoder_padding_mask, lid_scores, n_threads, lang_ids
     );
 
     // Holds the indices of beams (a beam can occur more than once) that we
@@ -1431,8 +1466,26 @@ extern "C" Hypothesis* generate_sequence(
     for (int step_nr = start_step; step_nr < max_seq_len - 1; ++step_nr) {
         model.ctx = step_ctx;
         ggml_set_no_alloc(step_ctx, true); // Use allocr for the model forward pass
+        if (step_nr == start_step) {
+            // When prefix_seq[1] is <unk>, find the most probable lang_tok and assign it to all beams.
+            if (ggml_get_i32_1d(job.prefix_seq, 1) == model.vocab.token_to_id["<unk>"]) {
+                float max_lprob = -std::numeric_limits<float>::infinity();
+                int p = -1;
+                for (size_t j = 0; j < lang_ids.size(); j++) {
+                    float val = ggml_get_f32_1d(lid_scores, j);
+                    if (val > max_lprob) {
+                        max_lprob = val;
+                        p = lang_ids[j];
+                    }
+                }
+                for (std::size_t k = 0; k < beam_size; k++) {
+                    // seqs is (beam_size, max_seq_len): the per-beam row stride is max_seq_len,
+                    // so this writes the lang_tok at the current step of every beam.
+                    ggml_set_i32_1d(seqs, k * max_seq_len + step_nr, p);
+                }
+            }
+        }
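+        // Past this point the chosen lang_tok is read back from `seqs` and decoded
+        // like any other prefix token.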
         ggml_tensor* prev_token = ggml_slice(step_ctx, seqs, 0, step_nr, step_nr + 1);
 
         ggml_tensor* decoder_input = TransformerEmbeddingFrontend_forward(model, "text_decoder_frontend", prev_token);
         ggml_tensor* decoder_output = StandardTransformerDecoder_forward(
             model,
@@ -1454,7 +1507,7 @@ extern "C" Hypothesis* generate_sequence(
         ggml_cgraph gf = ggml_build_forward(lprobs);
         size_t fwd_mem = ggml_allocr_alloc_graph(step_alloc, &gf);
         GGML_UNUSED(fwd_mem);
-        ggml_graph_compute_with_ctx(step_ctx, &gf, 1);
+        ggml_graph_compute_with_ctx(step_ctx, &gf, n_threads);
         ggml_detach(lprobs);
         ggml_allocr_reset(step_alloc);
 #if DEBUG_MEM_USAGE
@@ -1500,7 +1553,7 @@ extern "C" Hypothesis* generate_sequence(
             bool eos = token == job.eos_idx;
             eos &= tok_score != -INFINITY;
             if (eos) {
-                _finalize_hypothesis(job, result_ctx, step_nr, beam, token, tok_score, seqs, scores, finished_searches++);
+                _finalize_hypothesis(job, result_ctx, step_nr, beam, token, tok_score, seqs, scores, lid_scores, finished_searches++);
                 if (finished_searches == finished_searches_end)
                     goto end_of_beam_search;
                 continue;
@@ -1521,7 +1574,7 @@ extern "C" Hypothesis* generate_sequence(
         ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
         ggml_build_forward_expand(&gf_reorder, new_scores);
         reorder_kv_cache(model, step_ctx, &gf_reorder, beam_indices);
-        ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, 1);
+        ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, n_threads);
         seqs = ggml_detach(new_seqs);
         scores = ggml_detach(new_scores);
 
@@ -1729,6 +1782,7 @@ extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, g
     spm.tokenize(std::string(text), out);
 }
 
+
 extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out) {
     int eos_idx = model->vocab.token_to_id["</s>"];
     int sent_len = tokens->ne[0];
@@ -1750,3 +1804,52 @@ extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tenso
     *out = '0';
     return written;
 }
+
+
+// TODO: Unify with the above?
+std::pair<std::vector<std::string>, std::vector<float>> fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, ggml_tensor* scores, char* out) {
+    int eos_idx = model->vocab.token_to_id["</s>"];
+    int sent_len = tokens->ne[0];
+    std::size_t written = 0;
+    std::vector<float> word_scores;
+    std::vector<float> subword_scores;
+    std::vector<std::string> result_text;
+    std::string curr_token = "";
+    for (int i = 0; i < sent_len; ++i) {
+        int id = ggml_get_i32_1d(tokens, i);
+        // Don't print the EOS token, but only if it appears at the end.
+        if (i == sent_len - 1 && eos_idx == id) break;
+
+        std::string token = model->vocab.id_to_token.at(id).text;
+        float score = ggml_get_f32_1d(scores, i + 2); // offset by the prefix length (2)
+        if (token[0] == ' ') {
+            // A new word starts: flush the accumulated subword scores of the previous word.
+            if (subword_scores.size() > 0) {
+                float avg = std::accumulate(subword_scores.begin(), subword_scores.end(), 0.0f) / subword_scores.size();
+                word_scores.push_back(avg);
+                subword_scores.clear();
+                result_text.push_back(curr_token);
+            }
+            curr_token = token.substr(1);
+        } else {
+            curr_token += token;
+        }
+        subword_scores.push_back(std::exp(score));
+
+        // Skip the leading space of the first token.
+        auto begin = token.begin();
+        if (i == 0 && token.size() > 0 && token[0] == ' ') begin += 1;
+        std::copy(begin, token.end(), out);
+        std::size_t n = token.end() - begin;
+        written += n;
+        out += n;
+    }
+    // Flush the last word, averaging its subword scores like the loop above.
+    if (subword_scores.size() > 0) {
+        float avg = std::accumulate(subword_scores.begin(), subword_scores.end(), 0.0f) / subword_scores.size();
+        word_scores.push_back(avg);
+        subword_scores.clear();
+        result_text.push_back(curr_token);
+    }
+    *out = '\0';
+    return std::make_pair(result_text, word_scores);
+}
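+// Usage sketch for the scored overload (hypothetical names: `hyp.seq` and `hyp.step_scores`
+// stand for a finished hypothesis' token and step-score tensors, and the buffer size is
+// an arbitrary assumption):
+//
+//     std::vector<char> buf(4096);
+//     auto result = fairseq2_spm_detokenize(&model, hyp.seq, hyp.step_scores, buf.data());
+//     const std::vector<std::string>& words = result.first;     // detokenized words
+//     const std::vector<float>& confidences = result.second;    // avg subword probability per word
+//     for (std::size_t w = 0; w < words.size(); ++w)
+//         printf("%s\t%.3f\n", words[w].c_str(), confidences[w]);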