Browse Source

Add back unity lib files (#344)

* sync ggml to public SC

* update dependency versions to get rid of dependabot alerts

* add unity/lib

* revert unexpected changes

* revert unexpected changes
Ning 1 year ago
parent
commit
1862dea5b7
4 changed files with 261 additions and 2 deletions
  1. 0 1
      .gitignore
  2. 199 0
      ggml/examples/unity/lib/unity_lib.cpp
  3. 61 0
      ggml/examples/unity/lib/unity_lib.h
  4. 1 1
      ggml/requirements.txt

+ 0 - 1
.gitignore

@@ -21,7 +21,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/

+ 199 - 0
ggml/examples/unity/lib/unity_lib.cpp

@@ -0,0 +1,199 @@
+#include "unity_lib.h"
+#include <algorithm>
+#include <stdexcept>
+
+
+// Builds the forward graph for the text encoder.
+//
+// `text_input` goes through the "text_encoder_frontend" embedding frontend and
+// then the "text_encoder" transformer encoder. The encoder result is duplicated
+// with ggml_dup and that copy is the tensor the graph is expanded from.
+// Everything is created in `model.ctx`; nothing is computed here.
+struct ggml_cgraph * unity_text_encoder(
+        fairseq2_model & model,
+        struct ggml_tensor * text_input) {
+    // The graph object is created first, before any tensor node.
+    ggml_cgraph* graph = ggml_new_graph(model.ctx);
+
+    ggml_tensor* embedded = TransformerEmbeddingFrontend_forward(model, "text_encoder_frontend", text_input);
+    // TODO: handle padding mask (nullptr is passed for now)
+    ggml_tensor* encoded = StandardTransformerEncoder_forward(model, "text_encoder", embedded, nullptr);
+
+    ggml_tensor* graph_output = ggml_dup(model.ctx, encoded);
+    ggml_build_forward_expand(graph, graph_output);
+    return graph;
+}
+
+// Builds the forward graph for the speech encoder.
+//
+// `speech_input` is passed to StandardConformerEncoder_forward under the
+// "speech_encoder" prefix (no padding mask: nullptr). The result is duplicated
+// with ggml_dup and that copy is the tensor the graph is expanded from.
+// Everything is created in `model.ctx`; nothing is computed here.
+struct ggml_cgraph * unity_speech_encoder(
+        fairseq2_model& model,
+        struct ggml_tensor * speech_input) {
+    // The graph object is created first, before any tensor node.
+    ggml_cgraph* graph = ggml_new_graph(model.ctx);
+
+    ggml_tensor* encoded = StandardConformerEncoder_forward(model, "speech_encoder", speech_input, nullptr);
+
+    ggml_tensor* graph_output = ggml_dup(model.ctx, encoded);
+    ggml_build_forward_expand(graph, graph_output);
+    return graph;
+}
+
+// Runs sequence generation on top of an already computed encoder output.
+//
+// The decoder prefix is the "</s>" token id, followed by `tgt_lang_idx` when
+// that index is non-zero (multilingual case). The special token ids are read
+// from `model.vocab.token_to_id`. Returns whatever generate_sequence returns.
+Hypothesis* unity_decode(
+        fairseq2_model& model,
+        const SequenceGeneratorOptions& opts,
+        int tgt_lang_idx,
+        ggml_tensor* encoder_output,
+        int n_threads
+) {
+    auto& token_ids = model.vocab.token_to_id;
+    SequenceGeneratorJob job = {
+        opts,
+        /*prefix_seq*/ nullptr,  // filled in below once the tensor exists
+        /*pad_idx*/token_ids["<pad>"],
+        /*unk_idx*/token_ids["<unk>"],
+        /*bos_idx*/token_ids["<s>"],
+        /*eos_idx*/token_ids["</s>"],
+        /*num_threads*/n_threads,
+    };
+
+    // A zero language index means no language token is appended to the prefix.
+    const bool has_lang_token = (tgt_lang_idx != 0);
+    const int n_prefix_tokens = has_lang_token ? 2 : 1;
+    FORCE_ALLOC(prefix_seq, model.ctx, ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_prefix_tokens));
+
+    int* prefix_tokens = static_cast<int*>(prefix_seq->data);
+    prefix_tokens[0] = job.eos_idx;
+    if (has_lang_token) { // multilingual case
+        prefix_tokens[1] = tgt_lang_idx;
+    }
+
+    job.prefix_seq = prefix_seq;
+    return generate_sequence(model, job, encoder_output, nullptr, model.ctx, n_threads);
+}
+
+// Creates a default-constructed model, fills it from the file at `model_path`
+// via load_fairseq2_ggml_file, and returns it by value.
+// NOTE(review): nothing returned by load_fairseq2_ggml_file is inspected here,
+// so a failed load is not reported to the caller by this function.
+extern "C" fairseq2_model unity_init_model(const char* model_path) {
+    fairseq2_model loaded_model;
+    load_fairseq2_ggml_file(loaded_model, model_path);
+    return loaded_model;
+}
+
+// Runs the speech encoder and the decoder on a buffer of audio samples.
+// Struct as return - transcription, CE score, LID.
+//
+//   model     - loaded model; `model.ctx` is replaced by a call-local context,
+//               which is freed again before returning.
+//   data      - audio samples; the input tensor points at this buffer (no copy).
+//   opts      - generation options; `opts.mem_mb` sizes the encoder forward buffer.
+//   tgt_lang  - language code, looked up as "__<code>__" in the vocabulary;
+//               "unk" selects the "<unk>" token id instead.
+//   n_threads - thread count passed to graph computation and to decoding.
+//
+// Returns a Result with `err == 0` on success, or `err == 1` (other fields left
+// empty) when `tgt_lang` is not in the vocabulary.
+extern "C" Result unity_eval_speech(fairseq2_model& model, std::vector<float>& data, SequenceGeneratorOptions opts, std::string tgt_lang, int n_threads) {
+    Result result;
+
+    // Resolve the target language before allocating anything, so the error
+    // path returns without holding any buffer or allocator.
+    int tgt_lang_idx;
+    if (tgt_lang == "unk") {
+        tgt_lang_idx = model.vocab.token_to_id["<unk>"];
+    } else {
+        auto tgt_lang_ptr = model.vocab.token_to_id.find("__" + tgt_lang + "__");
+        if (tgt_lang_ptr == model.vocab.token_to_id.end()) {
+            std::cerr << "Unknown language " << tgt_lang << "\n";
+            result.err = 1;
+            return result;
+        }
+        tgt_lang_idx = tgt_lang_ptr->second;
+    }
+
+    // The ctx_size_mb mostly depends on input length and model dim.
+    // The byte count is computed in size_t: `int * 1024 * 1024` overflows
+    // as soon as mem_mb reaches 2048.
+    int ctx_size_mb = opts.mem_mb;
+    const size_t fwd_buf_bytes = static_cast<size_t>(ctx_size_mb) * 1024 * 1024;
+    auto encoder_buf = std::vector<uint8_t>(8 * 1024 * 1024);  // this is only for tensor metadata, it can be small
+    auto encoder_fwd_buf = std::vector<uint8_t>(fwd_buf_bytes);
+    // size() rather than capacity(): only the first size() bytes are constructed elements.
+    ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.size(), 8);
+
+    // Reset the ggml_context
+    model.ctx = ctx_from_buffer(encoder_buf);
+    ggml_set_no_alloc(model.ctx, true);
+    ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, data.size(), 1);
+    seqs->data = data.data();
+
+    // Audio encoder
+    ggml_cgraph* gf = unity_speech_encoder(model, seqs);
+    ggml_allocr_alloc_graph(fwd_alloc, gf);
+    ggml_graph_compute_with_ctx(model.ctx, gf, n_threads);
+    // encoder_output lives in encoder_fwd_buf and is valid until fwd_alloc is released
+    ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
+
+    // Beam search decoding
+    const Hypothesis* hypo = unity_decode(model, opts, tgt_lang_idx, encoder_output, n_threads);
+
+    // Drop language and bos token.
+    ggml_tensor* tokens = ggml_slice(model.ctx, hypo[0].seq, 0, 2, 0);
+
+    // Collect result string
+    // NOTE(review): fixed 4096-byte scratch buffer; whether fairseq2_spm_detokenize
+    // bounds its writes is not visible from here.
+    char result_str[4096];
+    std::pair<std::vector<std::string>, std::vector<float>> p = fairseq2_spm_detokenize(&model, tokens, hypo[0].step_scores, result_str);
+    result.transcription = p.first;
+    result.word_confidence_scores = p.second;
+
+    // Language tokens are the vocabulary entries of the form "__xxx__".
+    std::vector<int> lang_ids;
+    for (const auto& kv : model.vocab.token_to_id) {
+        const std::string& token = kv.first;
+        // The second test only runs when the first two characters are "__",
+        // so token.size() >= 2 there and the offset cannot underflow.
+        if (token.compare(0, 2, "__") == 0 && token.compare(token.size() - 2, 2, "__") == 0) {
+            lang_ids.push_back(kv.second);
+        }
+    }
+    // The i-th LID score is paired with the i-th language token id in ascending order.
+    // NOTE(review): assumes hypo[0].lid_scores holds at least lang_ids.size() values - confirm.
+    std::sort(lang_ids.begin(), lang_ids.end());
+    for (size_t i = 0; i < lang_ids.size(); ++i) {
+        result.lid_scores[model.vocab.id_to_token[lang_ids[i]].text] = ggml_get_f32_1d(hypo[0].lid_scores, i);
+    }
+    result.err = 0;
+
+    // Everything needed has been copied into `result`; release the call-local
+    // context and the allocator (the allocator was previously only reset and leaked).
+    ggml_free(model.ctx);
+    ggml_allocr_free(fwd_alloc);
+    return result;
+}
+
+
+// Tokenizes `text`, runs the text encoder and the decoder, and returns the
+// detokenized output together with its scores.
+//
+//   model     - loaded model; `model.ctx` is replaced by a call-local context,
+//               which is freed again before returning.
+//   text      - input text passed to fairseq2_spm_tokenize.
+//   opts      - generation options; `opts.mem_mb` sizes both working buffers.
+//   tgt_lang  - language code, looked up as "__<code>__" in the vocabulary;
+//               only consulted when the "multilingual" hparam is non-zero.
+//   n_threads - thread count passed to graph computation and to decoding.
+//
+// Returns a Result with `err == 0` on success, or `err == 1` (other fields left
+// empty) when the model is multilingual and `tgt_lang` is not in the vocabulary.
+// `lid_scores` is only filled for multilingual models.
+extern "C" Result unity_eval_text(fairseq2_model& model, const std::string& text, SequenceGeneratorOptions opts, std::string tgt_lang, int n_threads) {
+    Result result;
+    const bool multilingual = model.hparams["multilingual"] != 0;
+
+    // Resolve the target language before allocating anything, so the error
+    // path returns without holding any buffer or allocator.
+    int tgt_lang_idx = 0;
+    if (multilingual) {
+        auto tgt_lang_ptr = model.vocab.token_to_id.find("__" + tgt_lang + "__");
+        if (tgt_lang_ptr == model.vocab.token_to_id.end()) {
+            std::cerr << "Unknown language " << tgt_lang << "\n";
+            result.err = 1;
+            return result;
+        }
+        tgt_lang_idx = tgt_lang_ptr->second;
+    }
+
+    // The ctx_size_mb mostly depends on input length and model dim.
+    // The byte count is computed in size_t: `int * 1024 * 1024` overflows
+    // as soon as mem_mb reaches 2048.
+    int ctx_size_mb = opts.mem_mb;
+    const size_t buf_bytes = static_cast<size_t>(ctx_size_mb) * 1024 * 1024;
+    auto encoder_buf = std::vector<uint8_t>(buf_bytes);
+    auto encoder_fwd_buf = std::vector<uint8_t>(buf_bytes);
+    // size() rather than capacity(): only the first size() bytes are constructed elements.
+    ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.size(), 8);
+
+    // tokenize the input text
+    model.ctx = ctx_from_buffer(encoder_buf);
+    // Allocation is enabled only while the token tensor is created, so that
+    // it gets real storage inside encoder_buf.
+    ggml_set_no_alloc(model.ctx, false);
+    // NOTE(review): the tensor is created with 64 elements; how fairseq2_spm_tokenize
+    // handles longer inputs is not visible from here.
+    ggml_tensor* tokens_tensor = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 64);
+    ggml_set_no_alloc(model.ctx, true);
+    fairseq2_spm_tokenize(&model, text.c_str(), tokens_tensor);
+
+    // Text encoder
+    ggml_cgraph* gf = unity_text_encoder(model, tokens_tensor);
+    ggml_allocr_alloc_graph(fwd_alloc, gf);
+    ggml_graph_compute_with_ctx(model.ctx, gf, n_threads);
+    // encoder_output lives in encoder_fwd_buf and is valid until fwd_alloc is released
+    ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
+
+    // Beam search decoding
+    const Hypothesis* hypo = unity_decode(model, opts, tgt_lang_idx, encoder_output, n_threads);
+
+    // Drop language and bos token for multilingual, or only bos token for the bilingual model
+    int token_offset = multilingual ? 2 : 1;
+    ggml_tensor* tgt_tokens = ggml_slice(model.ctx, hypo[0].seq, 0, token_offset, 0);
+
+    // Collect result string
+    // NOTE(review): fixed 4096-byte scratch buffer; whether fairseq2_spm_detokenize
+    // bounds its writes is not visible from here.
+    char result_str[4096];
+    std::pair<std::vector<std::string>, std::vector<float>> p = fairseq2_spm_detokenize(&model, tgt_tokens, hypo[0].step_scores, result_str);
+    result.transcription = p.first;
+    result.word_confidence_scores = p.second;
+
+    if (multilingual) {
+        // Language tokens are the vocabulary entries of the form "__xxx__".
+        std::vector<int> lang_ids;
+        for (const auto& kv : model.vocab.token_to_id) {
+            const std::string& token = kv.first;
+            // The second test only runs when the first two characters are "__",
+            // so token.size() >= 2 there and the offset cannot underflow.
+            if (token.compare(0, 2, "__") == 0 && token.compare(token.size() - 2, 2, "__") == 0) {
+                lang_ids.push_back(kv.second);
+            }
+        }
+        // The i-th LID score is paired with the i-th language token id in ascending order.
+        // NOTE(review): assumes hypo[0].lid_scores holds at least lang_ids.size() values - confirm.
+        std::sort(lang_ids.begin(), lang_ids.end());
+        for (size_t i = 0; i < lang_ids.size(); ++i) {
+            result.lid_scores[model.vocab.id_to_token[lang_ids[i]].text] = ggml_get_f32_1d(hypo[0].lid_scores, i);
+        }
+    }
+    result.err = 0;
+
+    // Everything needed has been copied into `result`; release the call-local
+    // context and the allocator (the allocator was previously only reset and leaked).
+    ggml_free(model.ctx);
+    ggml_allocr_free(fwd_alloc);
+    return result;
+}

+ 61 - 0
ggml/examples/unity/lib/unity_lib.h

@@ -0,0 +1,61 @@
+#include "ggml/ggml.h"
+#include "ggml/ggml-alloc.h"
+
+#include "math.h"
+#include "model_loader.h"
+#include "fairseq2.h"
+
+#include <thread>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <cstdlib>
+/* Value returned by unity_eval_speech and unity_eval_text. */
+struct Result {
+    std::vector<std::string> transcription;  // first element of the pair returned by fairseq2_spm_detokenize
+    std::vector<float> word_confidence_scores;  // second element of the pair returned by fairseq2_spm_detokenize
+    std::unordered_map<std::string, float> lid_scores;  // language token text -> score read from the best hypothesis
+    int err;  // 0 on success, 1 when the target language is not in the vocabulary
+};
+/* Builds (without computing) the graph that runs StandardConformerEncoder_forward
+ * for "speech_encoder" on `speech_input`; the graph is expanded from a ggml_dup
+ * of the encoder result. Tensors and graph are created in `model.ctx`. */
+struct ggml_cgraph * unity_speech_encoder(
+    fairseq2_model& model,
+    struct ggml_tensor * speech_input
+);
+/* Builds (without computing) the graph that runs the "text_encoder_frontend"
+ * embedding frontend and the "text_encoder" transformer encoder on `text_input`;
+ * the graph is expanded from a ggml_dup of the encoder result. */
+struct ggml_cgraph * unity_text_encoder(
+    fairseq2_model& model,
+    struct ggml_tensor * text_input
+);
+/* Calls generate_sequence on `encoder_output` with a prefix made of the "</s>"
+ * token id, followed by `tgt_lang_idx` when it is non-zero, and returns the
+ * pointer generate_sequence returns. */
+Hypothesis* unity_decode(
+    fairseq2_model& model,
+    const SequenceGeneratorOptions& opts,
+    int tgt_lang_idx,
+    ggml_tensor* encoder_output,
+    int n_threads
+);
+/* Loads the file at `model_path` with load_fairseq2_ggml_file and returns the model by value. */
+extern "C" fairseq2_model unity_init_model(const char* model_path);
+/* Encodes the audio samples in `data`, decodes towards `tgt_lang` ("unk" selects
+ * the "<unk>" token id) and returns transcription, scores and LID scores.
+ * `Result::err` is 1 when `tgt_lang` is not in the vocabulary, 0 otherwise. */
+extern "C" Result unity_eval_speech(
+    fairseq2_model& model, 
+    std::vector<float>& data, 
+    SequenceGeneratorOptions opts, 
+    std::string tgt_lang, 
+    int n_threads
+);
+/* Tokenizes and encodes `text`, decodes it and returns the detokenized output
+ * with its scores. `tgt_lang` is only looked up, and LID scores only filled,
+ * when the model's "multilingual" hparam is non-zero. `Result::err` is 1 when
+ * that lookup fails, 0 otherwise. */
+extern "C" Result unity_eval_text(
+    fairseq2_model& model,  
+    const std::string& text, 
+    SequenceGeneratorOptions opts, 
+    std::string tgt_lang, 
+    int n_threads
+);

+ 1 - 1
ggml/requirements.txt

@@ -4,6 +4,6 @@ sentencepiece==0.1.98
 torch==2.0.1
 torchaudio==2.0.2
 torchvision==0.15.2
-transformers==4.29.2
+transformers==4.36.0
 fairseq2==0.2.1
 func_argparse