
unity.cpp API changes (#244)

* No alloc (#250)

* don't pre-allocate kv cache (it needs reordering anyway)

* enable support for more int operations

* fix buffers allocation

* add kv_cache_ctx for enc_dec attn cache

* add lifespan

* use allocr in generate_sequence

* test all layers with allocr

* avoid copy of wav file

* force allocation of kv_cache otherwise buffers are reused

* get_rows for ints

* ggml: pimp up dot graph

* Revert "add lifespan"

This reverts commit 73cf7963ff9a6dcb37b7713910ba81b797ffb743.

* cleanup

* Revert "ggml: pimp up dot graph"

This reverts commit 6bc467133900e9ba8f5cf48710c9249ea7be8aaf.

* less restrictive test

* rename

* LID / CE scores

* address comments

* when tgt_lang==unk, the model chooses it automatically

* drop extern for bootstrap & revert unnecessary changes

* comments

* typo
Ning 1 year ago
parent
commit
87f2d59ed7

+ 0 - 1
.gitignore

@@ -21,7 +21,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/

+ 13 - 2
ggml/examples/unity/CMakeLists.txt

@@ -7,13 +7,24 @@ target_sources(fairseq2_cpp
         fairseq2.cpp
         model_loader.cpp
 )
+add_library(unity_lib)
+target_include_directories(unity_lib PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(unity_lib PRIVATE ggml kaldi-native-fbank fairseq2_cpp)
+target_sources(unity_lib
+    PRIVATE
+        lib/unity_lib.h
+        lib/unity_lib.cpp
+)
+
 add_executable(unity unity.cpp)
 find_package(PkgConfig REQUIRED)
 pkg_check_modules(SNDFILE REQUIRED sndfile)
-target_include_directories(unity PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../.. ${SNDFILE_INCLUDE_DIRS})
-target_link_libraries(unity PRIVATE ggml ${SNDFILE_LIBRARIES})
+target_include_directories(unity PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${SNDFILE_INCLUDE_DIRS})
+target_link_libraries(unity PRIVATE ggml unity_lib ${SNDFILE_LIBRARIES})
 target_sources(unity
     PRIVATE
         fairseq2.cpp
         model_loader.cpp
+        lib/unity_lib.h
+        lib/unity_lib.cpp
 )

+ 114 - 11
ggml/examples/unity/fairseq2.cpp

@@ -11,6 +11,8 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 
+#include <numeric> 
+
 ggml_tensor* ggml_detach(ggml_tensor* a) {
     a->op = GGML_OP_NONE;
     std::fill(a->src, a->src + GGML_MAX_SRC, nullptr);
@@ -1166,15 +1168,18 @@ ggml_tensor* ggml_expand_2d(ggml_context* ctx, ggml_tensor* x, int64_t ne0, int6
     return y;
 }
 
-extern "C" void _bootstrap_seqs_and_scores(
+void _bootstrap_seqs_and_scores( 
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
     ggml_tensor* full_seqs,
     ggml_tensor* scores,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
-    int n_threads
+    ggml_tensor* lid_scores,
+    int n_threads,
+    const std::vector<int>& lang_ids
 ) {
+    // Fills lid_scores with the probability of each lang_tok at the first decoding step (used for LID)
     int prefix_seq_len = job.prefix_seq->ne[0];
     int max_seq_len = scores->ne[0];
     int beam_size = scores->ne[1];
@@ -1210,12 +1215,32 @@ extern "C" void _bootstrap_seqs_and_scores(
     ggml_tensor* lprobs = ggml_log_softmax(ctx, ggml_slice(ctx, logits, 1, 0, 1));
 
     ggml_cgraph gf = ggml_build_forward(lprobs);
-    ggml_graph_compute_with_ctx(ctx, &gf, 1);
+    ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
+
+    full_seqs->type = GGML_TYPE_I32;
+    job.prefix_seq->type = GGML_TYPE_I32;
+    // LID: store each lang_tok's probability (exp of its log-prob) at the first decoding step
+    for (size_t i = 0; i < lang_ids.size(); ++i) {
+        ggml_set_f32_1d(lid_scores, i, std::exp(ggml_get_f32_1d(lprobs, lang_ids[i])));
+    }
 
     // Fetch scores of next steps from "lprobs"
     float p_score = 0;
     for (int i = 1; i < prefix_seq_len; ++i) {
-        int p = ggml_get_i32_1d(job.prefix_seq, i);
+        int p;
+        if (ggml_get_i32_1d(job.prefix_seq, i) == model.vocab.token_to_id["<unk>"]) {
+            // If tgt_lang is <unk>, use the most probable lang_tok predicted by the model
+            float max_value = -std::numeric_limits<float>::infinity();
+            for (size_t j = 0; j < lang_ids.size(); ++j) {
+                if (ggml_get_f32_1d(lprobs, lang_ids[j]) > max_value) {
+                    max_value = ggml_get_f32_1d(lprobs, lang_ids[j]);
+                    p = lang_ids[j];
+                }
+            }
+        } else {
+            p = ggml_get_i32_1d(job.prefix_seq, i);
+        }
+
         p_score += ggml_get_f32_1d(lprobs, i * vocab_size + p);
         for (int b = 0; b < beam_size; ++b) {
             // scores: (N, S)
@@ -1296,6 +1321,7 @@ void _finalize_hypothesis(
     float eos_score,
     ggml_tensor* seqs, // (beam_size, seq_len)
     ggml_tensor* scores, // (beam_size, seq_len)
+    ggml_tensor* lid_scores,
     Hypothesis* hypothesis
 ) {
     ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
@@ -1323,6 +1349,7 @@ void _finalize_hypothesis(
         // Skip first EOS since it is always 0 and skews normalization.
         eos_score /= (float)std::pow((step_nr + 1), job.opts.len_penalty);
     hypothesis->score = eos_score;
+    hypothesis->lid_scores = lid_scores;
 }
 
 // Uses ggml_context to store any object.
@@ -1346,6 +1373,7 @@ ggml_allocr* new_arena_allocr(std::vector<uint8_t>& buffer) {
 
 /// Generates a translation for a single sequence
 /// The results Hypothesis are written inside `result_ctx`.
+/// If <unk> is set as the lang_tok, the sequence generator first predicts a lang_tok and uses it for subsequent decoding.
 extern "C" Hypothesis* generate_sequence(
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
@@ -1371,6 +1399,14 @@ extern "C" Hypothesis* generate_sequence(
     };
     ggml_allocr* step_alloc = new_arena_allocr(local_bufs[3]);
 
+    std::vector<int> lang_ids;
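+    // Language tokens look like "__eng__"; collect their ids so LID scores can be read at the first decoding step.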
+    for (const auto& kv : model.vocab.token_to_id) {
+        if (kv.first.substr(0, 2) == "__" && kv.first.substr(kv.first.size() - 2) == "__") {
+            lang_ids.push_back(kv.second);
+        }
+    }
+    std::sort(lang_ids.begin(), lang_ids.end());
+
     ggml_tensor* embed = model.tensors["text_decoder_frontend.embed.weight"];
     size_t vocab_size = embed->ne[1];
     std::size_t beam_size = job.opts.beam_size;
@@ -1400,10 +1436,8 @@ extern "C" Hypothesis* generate_sequence(
     ggml_tensor* scores = ggml_new_tensor_2d(search_ctx, GGML_TYPE_F32, max_seq_len, beam_size);
     ggml_set_name(scores, "scores_0");
     ggml_set_f32(scores, 0.0);
-
     int prefix_seq_len = job.prefix_seq->ne[0];
     int start_step = prefix_seq_len - 1;
-
     ggml_context* prev_step_ctx = ctx_from_buffer(local_bufs[(start_step - 1) % 2]);
     ggml_context* step_ctx = ctx_from_buffer(local_bufs[start_step % 2]);
     GGML_ASSERT(step_ctx != search_ctx);
@@ -1411,8 +1445,9 @@ extern "C" Hypothesis* generate_sequence(
     model.ctx = prev_step_ctx;
     // search_ctx because we need encoder_decoder_attn.k_cache to survive for the full search
     model.kv_cache_ctx = search_ctx;
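+    // Allocate lid_scores in result_ctx so it outlives the search and can be attached to the returned hypotheses.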
+    ggml_tensor* lid_scores = ggml_new_tensor_1d(result_ctx, GGML_TYPE_F32, lang_ids.size());
     _bootstrap_seqs_and_scores(
-        model, job, seqs, scores, encoder_output, encoder_padding_mask, n_threads
+        model, job, seqs, scores, encoder_output, encoder_padding_mask, lid_scores, n_threads, lang_ids
     );
 
     // Holds the indices of beams (a beam can occur more than once) that we
@@ -1431,8 +1466,26 @@ extern "C" Hypothesis* generate_sequence(
     for (int step_nr = start_step; step_nr < max_seq_len - 1; ++step_nr) {
         model.ctx = step_ctx;
         ggml_set_no_alloc(step_ctx, true); // Use allocr for the model forward pass
+        if (step_nr == start_step) {
+            // When prefix_seq[1] is <unk>, find the most probable lang_tok and assign it to all beams
+            if (ggml_get_i32_1d(job.prefix_seq, 1) == model.vocab.token_to_id["<unk>"]) {
+                float max_prob = -std::numeric_limits<float>::infinity();
+                int p = -1;
+                for (size_t j = 0; j < lang_ids.size(); ++j) {
+                    float val = ggml_get_f32_1d(lid_scores, j);
+                    if (val > max_prob) {
+                        max_prob = val;
+                        p = lang_ids[j];
+                    }
+                }
+                for (std::size_t k = 0; k < beam_size; ++k) {
+                    // seqs: (beam_size, max_seq_len)
+                    ggml_set_i32_1d(seqs, k * max_seq_len + step_nr, p);
+                }
+            }
+        }
         ggml_tensor* prev_token = ggml_slice(step_ctx, seqs, 0, step_nr, step_nr + 1);
-
+        
         ggml_tensor* decoder_input = TransformerEmbeddingFrontend_forward(model, "text_decoder_frontend", prev_token);
         ggml_tensor* decoder_output = StandardTransformerDecoder_forward(
             model,
@@ -1454,7 +1507,7 @@ extern "C" Hypothesis* generate_sequence(
         ggml_cgraph gf = ggml_build_forward(lprobs);
         size_t fwd_mem = ggml_allocr_alloc_graph(step_alloc, &gf);
         GGML_UNUSED(fwd_mem);
-        ggml_graph_compute_with_ctx(step_ctx, &gf, 1);
+        ggml_graph_compute_with_ctx(step_ctx, &gf, n_threads);
         ggml_detach(lprobs);
         ggml_allocr_reset(step_alloc);
 #if DEBUG_MEM_USAGE
@@ -1500,7 +1553,7 @@ extern "C" Hypothesis* generate_sequence(
             bool eos = token == job.eos_idx;
             eos &= tok_score != -INFINITY;
             if (eos) {
-                _finalize_hypothesis(job, result_ctx, step_nr, beam, token, tok_score, seqs, scores, finished_searches++);
+                _finalize_hypothesis(job, result_ctx, step_nr, beam, token, tok_score, seqs, scores, lid_scores, finished_searches++);
                 if (finished_searches == finished_searches_end)
                     goto end_of_beam_search;
                 continue;
@@ -1521,7 +1574,7 @@ extern "C" Hypothesis* generate_sequence(
         ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
         ggml_build_forward_expand(&gf_reorder, new_scores);
         reorder_kv_cache(model, step_ctx, &gf_reorder, beam_indices);
-        ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, 1);
+        ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, n_threads);
         seqs = ggml_detach(new_seqs);
         scores = ggml_detach(new_scores);
 
@@ -1729,6 +1782,7 @@ extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, g
     spm.tokenize(std::string(text), out);
 }
 
+
 extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out) {
     int eos_idx = model->vocab.token_to_id["</s>"];
     int sent_len = tokens->ne[0];
@@ -1750,3 +1804,52 @@ extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tenso
     *out = '0';
     return written;
 }
+
+
+// TODO: Unify with the above?
+std::pair<std::vector<std::string>, std::vector<float>> fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, ggml_tensor* scores, char* out) {
+    int eos_idx = model->vocab.token_to_id["</s>"];
+    int sent_len = tokens->ne[0];
+    std::size_t written = 0;
+    std::vector<float> word_scores;
+    std::vector<float> subword_scores;
+    std::vector<std::string> result_text;
+    std::string curr_token = "";
+    for (int i = 0; i < sent_len; ++i) {
+        int id = ggml_get_i32_1d(tokens, i);
+        // Skip the EOS token, but only when it appears at the end.
+        if (i == sent_len - 1 && eos_idx == id) break;
+
+        std::string token = model->vocab.id_to_token.at(id).text;
+        float score = ggml_get_f32_1d(scores, i + 2); // offset by the prefix size (EOS + lang_tok)
+        if (token[0] == ' ') {
+            // a leading ' ' marks a word start (SentencePiece convention): flush the previous word's scores
+            if(subword_scores.size() > 0) {
+                float avg = std::accumulate(subword_scores.begin(), subword_scores.end(), 0.0f) / subword_scores.size();
+                word_scores.push_back(avg);
+                subword_scores.clear();
+                result_text.push_back(curr_token);
+            }
+            curr_token = token.substr(1);
+        } else {
+            curr_token += token;
+        }
+        subword_scores.push_back(std::exp(score));
+        
+        // Skip the first space outputted.
+        auto begin = token.begin();
+        if (i == 0 && token.size() > 0 && token[0] == ' ') begin += 1;
+        std::copy(begin, token.end(), out);
+        std::size_t n = token.end() - begin;
+        written += n;
+        out += n;
+        
+    }
+    if (!subword_scores.empty()) {
+        // Flush the final word, averaging its subword scores as above.
+        float avg = std::accumulate(subword_scores.begin(), subword_scores.end(), 0.0f) / subword_scores.size();
+        word_scores.push_back(avg);
+        result_text.push_back(curr_token);
+    }
+    *out = '\0';
+    return std::make_pair(result_text, word_scores);
+}
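
Worked example of the aggregation above (illustrative numbers): a word split into three subword tokens with step log-probs -0.11, -0.36 and -0.22 gets word confidence (e^-0.11 + e^-0.36 + e^-0.22) / 3 ≈ (0.896 + 0.698 + 0.803) / 3 ≈ 0.80.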

+ 5 - 0
ggml/examples/unity/fairseq2.h

@@ -299,6 +299,9 @@ struct Hypothesis {
 
     /// The score of each individual sequence step.
     ggml_tensor* step_scores;
+
+    /// The score of each lang tok at the first decoding step, serving as LID.
+    ggml_tensor* lid_scores;
 };
 
 
@@ -313,3 +316,5 @@ extern "C" Hypothesis* generate_sequence(
 
 extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor& out);
 extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out);
+std::pair<std::vector<std::string>, std::vector<float>> fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, ggml_tensor* scores, char* out);
+
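
A minimal sketch of consuming the new Hypothesis::lid_scores field (it mirrors the lookup in unity_lib.cpp; `hypo` and `model` are assumed to come from a generate_sequence caller):

    // Collect language-token ids ("__xxx__") in the same sorted order used to fill lid_scores.
    std::vector<int> lang_ids;
    for (const auto& kv : model.vocab.token_to_id)
        if (kv.first.substr(0, 2) == "__" && kv.first.substr(kv.first.size() - 2) == "__")
            lang_ids.push_back(kv.second);
    std::sort(lang_ids.begin(), lang_ids.end());
    for (size_t i = 0; i < lang_ids.size(); ++i)
        printf("%s: %.3f\n", model.vocab.id_to_token[lang_ids[i]].text.c_str(),
               ggml_get_f32_1d(hypo->lid_scores, i));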

+ 112 - 0
ggml/examples/unity/lib/unity_lib.cpp

@@ -0,0 +1,112 @@
+#include "unity_lib.h"
+#include <algorithm>
+
+
+struct ggml_cgraph * unity_speech_encoder(
+        fairseq2_model& model,
+        struct ggml_tensor * speech_input) {
+    ggml_context* ctx0 = model.ctx;
+    ggml_cgraph* gf = ggml_new_graph(ctx0);
+    ggml_tensor* seqs = StandardConformerEncoder_forward(model, "speech_encoder", speech_input, nullptr);
+    seqs = ggml_dup(model.ctx, seqs);
+    ggml_build_forward_expand(gf, seqs);
+    return gf;
+}
+
+Hypothesis* unity_decode(
+        fairseq2_model& model,
+        const SequenceGeneratorOptions& opts,
+        int tgt_lang_idx,
+        ggml_tensor* encoder_output,
+        int n_threads
+) {
+    SequenceGeneratorJob job = {
+        opts,
+        /*prefix_seq*/ nullptr,
+        /*pad_idx*/model.vocab.token_to_id["<pad>"],
+        /*unk_idx*/model.vocab.token_to_id["<unk>"],
+        /*bos_idx*/model.vocab.token_to_id["<s>"],
+        /*eos_idx*/model.vocab.token_to_id["</s>"],
+        /*num_threads*/n_threads,
+    };
+    FORCE_ALLOC(prefix_seq, model.ctx, ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 2));
+    ((int *)prefix_seq->data)[0]  = job.eos_idx;
+    ((int *)prefix_seq->data)[1]  = tgt_lang_idx;
+    job.prefix_seq = prefix_seq;
+    return generate_sequence(model, job, encoder_output, nullptr, model.ctx, n_threads);
+}
+
+extern "C" fairseq2_model unity_init_model(const char* model_path) {
+    fairseq2_model model;
+    load_fairseq2_ggml_file(model, model_path);
+    return model;
+}
+
+// Returns a Result struct: transcription, word-level confidence (CE) scores, and LID scores.
+extern "C" Result unity_eval(fairseq2_model model, std::vector<float> data, SequenceGeneratorOptions opts, std::string tgt_lang, int n_threads, int memory_mb) {
+    Result result;
+    // The needed context size (MB) mostly depends on input length and model dim; take it from the caller.
+    int ctx_size_mb = memory_mb;
+    auto encoder_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
+    auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
+    ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
+    char result_str[4096];
+    int tgt_lang_idx;
+    if (tgt_lang == "unk") {
+        tgt_lang_idx = model.vocab.token_to_id["<unk>"];
+    } else {
+        auto tgt_lang_ptr = model.vocab.token_to_id.find("__" + tgt_lang + "__"); 
+        if (tgt_lang_ptr == model.vocab.token_to_id.end()) {
+            std::cerr << "Unknown language " << tgt_lang << "\n";
+            result.err = 1;
+            return result;
+        }
+        tgt_lang_idx = tgt_lang_ptr->second;
+    }
+
+
+    // Reset the ggml_context
+    model.ctx = ctx_from_buffer(encoder_buf);
+    ggml_set_no_alloc(model.ctx, false);
+    struct ggml_tensor * seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, data.size(), 1);
+    memcpy(seqs->data, data.data(), data.size() * sizeof(float));
+    ggml_set_no_alloc(model.ctx, true);
+
+    // Audio encoder
+    ggml_cgraph* gf = unity_speech_encoder(model, seqs);
+    ggml_allocr_alloc_graph(fwd_alloc, gf);
+    ggml_graph_compute_with_ctx(model.ctx, gf, n_threads);
+    // encoder_output is valid until we call `ggml_allocr_reset(fwd_alloc)`
+    ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
+
+    // Beam search decoding
+    const Hypothesis* hypo = unity_decode(model, opts, tgt_lang_idx, encoder_output, n_threads);
+
+    // Drop language and bos token.
+    ggml_tensor* tokens = ggml_slice(model.ctx, hypo[0].seq, 0, 2, 0);
+
+    // Collect result string
+    std::pair<std::vector<std::string>, std::vector<float>> p = fairseq2_spm_detokenize(&model, tokens, hypo[0].step_scores, (char*)&result_str);
+    std::vector<std::string> result_tokens = p.first;
+    std::vector<float> word_scores = p.second;
+
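+    // Re-collect the language-token ids (same sorted order as in generate_sequence) to label each LID score.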
+    std::unordered_map<std::string, float> lid_scores;
+    std::vector<int> lang_ids;
+    for (const auto& kv : model.vocab.token_to_id) {
+        if (kv.first.substr(0, 2) == "__" && kv.first.substr(kv.first.size() - 2) == "__") {
+            lang_ids.push_back(kv.second);
+        }
+    }
+    std::sort(lang_ids.begin(), lang_ids.end());
+    for (size_t i = 0; i < lang_ids.size(); ++i) {
+        lid_scores[model.vocab.id_to_token[lang_ids[i]].text] = ggml_get_f32_1d(hypo[0].lid_scores, i); 
+    }
+    
+    result.transcription = result_tokens;
+    result.word_confidence_scores = word_scores;
+    result.lid_scores = lid_scores;
+    result.err = 0;
+    ggml_free(model.ctx);
+    ggml_allocr_reset(fwd_alloc);
+    return result;
+}

+ 41 - 0
ggml/examples/unity/lib/unity_lib.h

@@ -0,0 +1,41 @@
+#include "ggml/ggml.h"
+#include "ggml/ggml-alloc.h"
+
+#include "math.h"
+#include "model_loader.h"
+#include "fairseq2.h"
+
+#include <thread>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <cstdlib>
+
+struct Result {
+    std::vector<std::string> transcription;
+    std::vector<float> word_confidence_scores;
+    std::unordered_map<std::string, float> lid_scores;
+    int err;
+};
+
+struct ggml_cgraph * unity_speech_encoder(
+    fairseq2_model& model,
+    struct ggml_tensor * speech_input);
+
+Hypothesis* unity_decode(
+        fairseq2_model& model,
+        const SequenceGeneratorOptions& opts,
+        int tgt_lang_idx,
+        ggml_tensor* encoder_output,
+        int n_threads
+);
+
+extern "C" fairseq2_model unity_init_model(const char* model_path);
+
+extern "C" Result unity_eval(fairseq2_model model, std::vector<float> data, SequenceGeneratorOptions opts, std::string tgt_lang, int n_threads, int memory_gb);

+ 39 - 84
ggml/examples/unity/unity.cpp

@@ -4,20 +4,12 @@
 #include "math.h"
 #include "model_loader.h"
 #include "fairseq2.h"
-
-#include <thread>
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <iostream>
+#include "lib/unity_lib.h"
 #include <sndfile.h>
 #include <cstdlib>
 #include "ggml-alloc.h"
+#include <numeric>
+#include <algorithm>
 
 struct unity_params {
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -35,6 +27,7 @@ struct unity_params {
         /*unk_penalty*/ 0.0,
         /*normalize_scores*/ true,
     };
+    bool verbose = false;
 };
 
 
@@ -44,6 +37,7 @@ void unity_print_usage(int /*argc*/, char ** argv, const unity_params & params)
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -v, --verbose         print word-level confidence scores and LID scores\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, "  --text                text output\n");
@@ -77,48 +71,16 @@ bool unity_params_parse(int argc, char ** argv, unity_params & params) {
             params.text = true;
         } else if (arg == "-b" || arg == "--beam-size") {
             params.opts.beam_size = std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else {
+        } else if (arg == "-v" || arg == "--verbose") {
+            params.verbose = true;
+        } else {
             params.files.push_back(std::string(arg));
         }
     }
     return true;
 }
 
-struct ggml_cgraph * unity_speech_encoder(
-        fairseq2_model& model,
-        struct ggml_tensor * speech_input) {
-    ggml_context* ctx0 = model.ctx;
-    ggml_cgraph* gf = ggml_new_graph(ctx0);
-    ggml_tensor* seqs = StandardConformerEncoder_forward(model, "speech_encoder", speech_input, nullptr);
-    seqs = ggml_dup(model.ctx, seqs);
-    ggml_build_forward_expand(gf, seqs);
-    return gf;
-}
-
-
-Hypothesis* unity_decode(
-        fairseq2_model& model,
-        const SequenceGeneratorOptions& opts,
-        int tgt_lang_idx,
-        ggml_tensor* encoder_output,
-        int n_threads
-) {
-    SequenceGeneratorJob job = {
-        opts,
-        /*prefix_seq*/ nullptr,
-        /*pad_idx*/model.vocab.token_to_id["<pad>"],
-        /*unk_idx*/model.vocab.token_to_id["<unk>"],
-        /*bos_idx*/model.vocab.token_to_id["<s>"],
-        /*eos_idx*/model.vocab.token_to_id["</s>"],
-        /*num_threads*/n_threads,
-    };
-    FORCE_ALLOC(prefix_seq, model.ctx, ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 2));
-    ((int *)prefix_seq->data)[0]  = job.eos_idx;
-    ((int *)prefix_seq->data)[1]  = tgt_lang_idx;
-    job.prefix_seq = prefix_seq;
-    return generate_sequence(model, job, encoder_output, nullptr, model.ctx, n_threads);
-}
-
 int main(int argc, char ** argv) {
 
     unity_params params;
@@ -135,13 +97,6 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    // The ctx_size_mb mostly depends of input length and model dim.
-    int ctx_size_mb = 128;
-    auto encoder_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
-    auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
-    ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
-    char result_str[4096];
-
     std::string input;
     bool interactive = params.files.size() == 0;
     auto next_file = params.files.begin();
@@ -171,42 +126,42 @@ int main(int argc, char ** argv) {
             if (interactive) continue;
             else return 1;
         }
-        auto tgt_lang_ptr = model.vocab.token_to_id.find("__" + tgt_lang + "__");
-        if (tgt_lang_ptr == model.vocab.token_to_id.end()) {
-            std::cerr << "Unknown language " << tgt_lang << "\n";
-            if (interactive) continue;
-            else return 2;
-        }
-        int tgt_lang_idx = tgt_lang_ptr->second;
-
-
+        // Load audio input
+        std::vector<float> data(info.frames * info.channels); // Assume info.channels is always 1
+        sf_readf_float(sndfile, data.data(), info.frames);
         // Reset the ggml_context
+        // The ctx_size_mb value mostly depends on input length and model dim.
+        int ctx_size_mb = 128;
+        auto encoder_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
+        auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
+        ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
+        char result_str[4096];
         model.ctx = ctx_from_buffer(encoder_buf);
         ggml_set_no_alloc(model.ctx, false);
         ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, info.frames, info.channels);
         ggml_set_no_alloc(model.ctx, true);
 
-        // Load audio input
-        sf_readf_float(sndfile, (float*)seqs->data, info.frames);
-
-        // Audio encoder
-        ggml_cgraph* gf = unity_speech_encoder(model, seqs);
-        ggml_allocr_alloc_graph(fwd_alloc, gf);
-        ggml_graph_compute_with_ctx(model.ctx, gf, params.n_threads);
-        // encoder_output is valid until we call `ggml_allocr_reset(fwd_alloc)`
-        ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
-
-        // Beam search decoding
-        const Hypothesis* result = unity_decode(model, params.opts, tgt_lang_idx, encoder_output, params.n_threads);
-    
-        // Drop language and bos token.
-        ggml_tensor* tokens = ggml_slice(model.ctx, result[0].seq, 0, 2, 0);
-
-        // Collect result string
-        int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str);
-        std::cout << std::string((char*)&result_str, n) << std::endl;
-        ggml_free(model.ctx);
-        ggml_allocr_reset(fwd_alloc);
+        Result result = unity_eval(model, data, params.opts, tgt_lang, params.n_threads, ctx_size_mb);
+        // Join the words with spaces; guard against an empty transcription.
+        std::string concat_transcription = result.transcription.empty() ? "" :
+            std::accumulate(std::next(result.transcription.begin()), result.transcription.end(), result.transcription[0],
+                [](const std::string& a, const std::string& b) {
+                    return a + " " + b;
+                });
+        if (params.verbose) {
+            std::cout << "Final transcription: " << concat_transcription << std::endl;
+            std::cout << std::endl;
+            std::cout << "Word level confidence score:" << std::endl;
+            for (size_t i = 0; i < result.transcription.size(); ++i) {
+                std::cout << "Word: " << result.transcription[i] << " | Score: " << result.word_confidence_scores[i] << std::endl;
+            }
+            std::cout << std::endl;
+            std::cout << "LID scores: " << std::endl;
+            for (const auto& kv : result.lid_scores) {
+                std::cout << "Language: " << kv.first << "| Score: " << kv.second << std::endl;
+            }
+        } else {
+            std::cout << concat_transcription << std::endl;
+        }
     }
 
     return 0;