Unity.cpp nllb support (#258)

* extend fairseq2 and unity_lib data structure

* update read and write for nllb and unity-arch models

* address Ning's and Guil's comments about the nllb layer naming

* convert on write instead of on read

* fix bugs when combining nllb ggml_convert with layer_filters; fix typos

* sync local commit with upstream

* increase default graph size to test bigger nllb model

* fix docstrings

* fix docstrings

* revert nit docstring

---------

Co-authored-by: Tuan Tran <tuantran@devfair0436.h2.fair>
Tuan Tran · 1 year ago · commit 34575dc9b3
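The core of the change is renaming NLLB checkpoint entries to the unity-arch layer layout at conversion (write) time instead of at load (read) time. A minimal sketch of that renaming in Python, using the `NLLB_2_UNITY_KEYMAP` patterns introduced in `ggml_convert.py` below (the helper function and example state dict are illustrative, not part of this commit):

```python
import re

# Regex patterns from NLLB_2_UNITY_KEYMAP: NLLB tensor names -> unity-arch names.
NLLB_2_UNITY_KEYMAP = {
    r"^encoder_frontend\.": "text_encoder_frontend.",
    r"^encoder\.": "text_encoder.",
    r"^decoder\.": "text_decoder.",
    r"^decoder_frontend\.": "text_decoder_frontend.",
}

def rename_nllb_keys(state_dict: dict) -> dict:
    """Apply the key map once to every tensor name (hypothetical helper)."""
    renamed = {}
    for key, value in state_dict.items():
        for pattern, replacement in NLLB_2_UNITY_KEYMAP.items():
            key = re.sub(pattern, replacement, key)
        renamed[key] = value
    return renamed

# Example: an NLLB-style key becomes a unity-arch key.
print(rename_nllb_keys({"encoder.layers.0.self_attn.q_proj.weight": None}))
# {'text_encoder.layers.0.self_attn.q_proj.weight': None}
```

Because the keys already look like the unity architecture inside the .ggml file, the C++ loader can reuse the existing unity layer lookup unchanged.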

ggml/examples/unity/fairseq2.cpp (+6, -6)

@@ -1580,7 +1580,7 @@ extern "C" Hypothesis* generate_sequence(
             ((float*)scores->data)[step_nr + 1 + i * max_seq_len] = ggml_get_f32_1d(next_scores, i);
         }
 
-        printf_mem_usage(step_ctx, "  step_ctx");
+        printf_mem_usage(step_ctx, "step_ctx");
         ggml_free(prev_step_ctx);
         prev_step_ctx = step_ctx;
 #if DEBUG_MEM_USAGE
@@ -1656,7 +1656,7 @@ struct llm_bigram_spm {
 struct llm_tokenizer_spm {
     llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
 
-    void tokenize(const std::string& input_text, ggml_tensor& output) {
+    void tokenize(const std::string& input_text, ggml_tensor* output) {
         llama_vocab::id unk_idx = vocab.token_to_id.at("<unk>");
 
         // split string into utf8 chars
@@ -1724,8 +1724,8 @@ struct llm_tokenizer_spm {
             try_add_bigram(bigram.left, left_sym.next);
         }
 
-        llama_vocab::id* out = (llama_vocab::id*)output.data;
-        int out_step = sizeof(llama_vocab::id) / output.nb[0];
+        llama_vocab::id* out = (llama_vocab::id*)output->data;
+        int out_step = sizeof(llama_vocab::id) / output->nb[0];
         int num_tokens = 0;
         for (int i = 0; i > -1; i = symbols[i].next) {
             llm_symbol& symbol = symbols[i];
@@ -1734,7 +1734,7 @@ struct llm_tokenizer_spm {
         }
         *(out + num_tokens * out_step) = vocab.token_to_id.at("</s>");
         num_tokens += 1;
-        output.ne[0] = num_tokens;
+        output->ne[0] = num_tokens;
     }
 
 private:
@@ -1773,7 +1773,7 @@ private:
 };
 
 
-extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor& out) {
+extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor* out) {
     llm_tokenizer_spm spm = {model->vocab};
     spm.tokenize(std::string(text), out);
 }

ggml/examples/unity/fairseq2.h (+8, -2)

@@ -199,6 +199,13 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
     ggml_tensor* padding_mask
 );
 
+extern "C" ggml_tensor* StandardTransformerEncoder_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs,
+    ggml_tensor* padding_mask
+);
+
 extern "C" ggml_tensor* RelativePositionMHA_forward(
     fairseq2_model& model,
     const std::string& prefix,
@@ -317,8 +324,7 @@ extern "C" Hypothesis* generate_sequence(
     int threads
 );
 
-extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor& out);
-
+extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor* out);
 extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out);
 
 std::pair<std::vector<std::string>, std::vector<float>> fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, ggml_tensor* scores, char* out);

ggml/examples/unity/lib/unity_lib.cpp (+77, -1)

@@ -2,6 +2,23 @@
 #include <algorithm>
 
 
+struct ggml_cgraph * unity_text_encoder(
+        fairseq2_model & model,
+        struct ggml_tensor * text_input) {
+    ggml_context* ctx0 = model.ctx;
+    ggml_cgraph* gf = ggml_new_graph(ctx0);
+    ggml_tensor* seqs = TransformerEmbeddingFrontend_forward(model, "text_encoder_frontend", text_input);
+    ggml_tensor* encoder_output = StandardTransformerEncoder_forward(
+        model,
+        "text_encoder",
+        seqs,
+        nullptr  // TODO: handle padding mask
+    );
+    encoder_output = ggml_dup(model.ctx, encoder_output);
+    ggml_build_forward_expand(gf, encoder_output);
+    return gf;
+}
+
 struct ggml_cgraph * unity_speech_encoder(
         fairseq2_model& model,
         struct ggml_tensor * speech_input) {
@@ -43,7 +60,7 @@ extern "C" fairseq2_model unity_init_model(const char* model_path) {
 }
 
 //  struct as return - transcription, CE score, LID 
-extern "C" Result unity_eval(fairseq2_model model, std::vector<float> data, SequenceGeneratorOptions opts, std::string tgt_lang, int n_threads, int memory_mb) {
+extern "C" Result unity_eval_speech(fairseq2_model& model, std::vector<float>& data, SequenceGeneratorOptions opts, std::string tgt_lang, int n_threads) {
     Result result;
     // The ctx_size_mb mostly depends of input length and model dim.
     int ctx_size_mb = opts.mem_mb;
@@ -101,10 +118,69 @@ extern "C" Result unity_eval(fairseq2_model model, std::vector<float> data, Sequ
         lid_scores[model.vocab.id_to_token[lang_ids[i]].text] = ggml_get_f32_1d(hypo[0].lid_scores, i); 
     }
     
+    
     result.transcription = result_tokens;
     result.word_confidence_scores = word_scores;
     result.lid_scores = lid_scores;
     result.err = 0;
+    ggml_free(model.ctx);
+    ggml_allocr_reset(fwd_alloc);
+    return result;
+}
+
+
+extern "C" Result unity_eval_text(fairseq2_model& model, const std::string& text, SequenceGeneratorOptions opts, std::string tgt_lang, int n_threads) {
+    Result result;
+    // The ctx_size_mb mostly depends of input length and model dim.
+    int ctx_size_mb = opts.mem_mb;
+    auto encoder_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
+    auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
+    ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
+    int tgt_lang_idx;
+    auto tgt_lang_ptr = model.vocab.token_to_id.find("__" + tgt_lang + "__"); 
+    if (tgt_lang_ptr == model.vocab.token_to_id.end()) {
+        std::cerr << "Unknown language " << tgt_lang << "\n";
+        result.err = 1;
+        return result;
+    }
+    tgt_lang_idx = tgt_lang_ptr->second;
+
+    // tokenize the input text
+    model.ctx = ctx_from_buffer(encoder_buf);
+    ggml_set_no_alloc(model.ctx, false);
+    ggml_tensor* tokens = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 64);
+    ggml_set_no_alloc(model.ctx, true);
+    fairseq2_spm_tokenize(&model, text.c_str(), tokens);
+    
+    // Text encoder
+    ggml_cgraph* gf = unity_text_encoder(model, tokens);
+    ggml_allocr_alloc_graph(fwd_alloc, gf);
+    ggml_graph_compute_with_ctx(model.ctx, gf, n_threads);
+    ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
+    
+    // Beam search decoding
+    const Hypothesis* hypo = unity_decode(model, opts, tgt_lang_idx, encoder_output, n_threads);
+    
+    // Drop language and bos token.
+    ggml_tensor* tgt_tokens = ggml_slice(model.ctx, hypo[0].seq, 0, 2, 0);
+    // Collect result string
+    char result_str[4096];
+
+    std::pair<std::vector<std::string>, std::vector<float>> p = fairseq2_spm_detokenize(&model, tgt_tokens, hypo[0].step_scores, (char*)&result_str);
+    std::vector<std::string> result_tokens = p.first;
+    std::vector<float> word_scores = p.second;
+
+    std::unordered_map<std::string, float> lid_scores;
+    std::vector<int> lang_ids;
+    for (const auto& kv : model.vocab.token_to_id) {
+        if (kv.first.substr(0, 2) == "__" && kv.first.substr(kv.first.size() - 2) == "__") {
+            lang_ids.push_back(kv.second);
+        }
+    }
+    std::sort(lang_ids.begin(), lang_ids.end());
+    for (size_t i = 0; i < lang_ids.size(); ++i) {
+        lid_scores[model.vocab.id_to_token[lang_ids[i]].text] = ggml_get_f32_1d(hypo[0].lid_scores, i); 
+    }
     
     result.transcription = result_tokens;
     result.word_confidence_scores = word_scores;

ggml/examples/unity/lib/unity_lib.h (+22, -2)

@@ -26,7 +26,13 @@ struct Result {
 
 struct ggml_cgraph * unity_speech_encoder(
     fairseq2_model& model,
-    struct ggml_tensor * speech_input);
+    struct ggml_tensor * speech_input
+);
+
+struct ggml_cgraph * unity_text_encoder(
+    fairseq2_model& model,
+    struct ggml_tensor * text_input
+);
 
 Hypothesis* unity_decode(
         fairseq2_model& model,
@@ -38,4 +44,18 @@ Hypothesis* unity_decode(
 
 extern "C" fairseq2_model unity_init_model(const char* model_path);
 
-extern "C" Result unity_eval(fairseq2_model model, std::vector<float> data, SequenceGeneratorOptions opts, std::string tgt_lang, int n_threads, int memory_gb);
+extern "C" Result unity_eval_speech(
+    fairseq2_model& model, 
+    std::vector<float>& data, 
+    SequenceGeneratorOptions opts, 
+    std::string tgt_lang, 
+    int n_threads
+);
+
+extern "C" Result unity_eval_text(
+    fairseq2_model& model,  
+    const std::string& text, 
+    SequenceGeneratorOptions opts, 
+    std::string tgt_lang, 
+    int n_threads
+);

ggml/examples/unity/unity.cpp (+31, -5)

@@ -13,7 +13,8 @@
 
 struct unity_params {
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    std::string model      = "seamlessM4T_medium.ggml"; // model path
+    std::string model = "seamlessM4T_medium.ggml"; // model path
+    std::string input_text = "";
     std::string tgt_lang = "eng";
     std::vector<std::string> files = {};
     bool text = false;
@@ -26,7 +27,7 @@ struct unity_params {
         /*len_penalty*/ 1.0,
         /*unk_penalty*/ 0.0,
         /*normalize_scores*/ true,
-        /*mem_mb*/ 512,
+        /*mem_mb*/ 512
     };
     bool verbose = false;
 };
@@ -37,6 +38,9 @@ void unity_print_usage(int /*argc*/, char ** argv, const unity_params & params)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -i, --input           Input text for the text-2-text translation\n");
+    fprintf(stderr, "  -l, --tgt-lang        Target translation lang (default: %s\n", params.tgt_lang);
+
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -v, --verbose         Print out word level confidence score and LID score (default: off)");
     fprintf(stderr, "  -v, --verbose         Print out word level confidence score and LID score (default: off)");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
@@ -67,6 +71,8 @@ bool unity_params_parse(int argc, char ** argv, unity_params & params) {
             params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
             params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
         } else if (arg == "-m" || arg == "--model") {
         } else if (arg == "-m" || arg == "--model") {
             params.model = get_next_arg(i, argc, argv, arg, params);
             params.model = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "-i" || arg == "--input") {
+            params.input_text = get_next_arg(i, argc, argv, arg, params);
         } else if (arg == "-l" || arg == "--tgt-lang") {
         } else if (arg == "-l" || arg == "--tgt-lang") {
             params.tgt_lang = get_next_arg(i, argc, argv, arg, params);
             params.tgt_lang = get_next_arg(i, argc, argv, arg, params);
         } else if (arg == "--text") {
         } else if (arg == "--text") {
@@ -108,8 +114,13 @@ int main(int argc, char ** argv) {
     char result_str[4096];
     char result_str[4096];
 
 
     std::string input;
     std::string input;
-    bool interactive = params.files.size() == 0;
+    bool interactive = (params.files.size() == 0 && params.input_text.length() == 0);
     auto next_file = params.files.begin();
     auto next_file = params.files.begin();
+
+    // Flag for the input case: true --> s2st, false --> t2tt
+    bool s2st_or_t2tt = true;
+
+    // S2ST
     while (true) {
     while (true) {
         if (interactive) {
         if (interactive) {
             std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n";
             std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n";
@@ -118,7 +129,10 @@ int main(int argc, char ** argv) {
                 break;
                 break;
             }
             }
         } else {
         } else {
-            if (next_file == params.files.end()) break;
+            if (params.input_text.length() > 0) {
+                break;
+            }
+            if (next_file == params.files.end() && s2st_or_t2tt) break;
             input = *(next_file++);
             input = *(next_file++);
         }
         }
         std::istringstream iss(input);
         std::istringstream iss(input);
@@ -144,7 +158,7 @@ int main(int argc, char ** argv) {
         std::vector<float> data(n_frames * info.channels);
         std::vector<float> data(n_frames * info.channels);
         sf_readf_float(sndfile, data.data(), n_frames);
         sf_readf_float(sndfile, data.data(), n_frames);
 
 
-        Result result = unity_eval(model, data, params.opts, tgt_lang, params.n_threads, ctx_size_mb);
+        Result result = unity_eval_speech(model, data, params.opts, tgt_lang, params.n_threads);
         std::string concat_transcription = std::accumulate(std::next(result.transcription.begin()), result.transcription.end(), result.transcription[0],
         std::string concat_transcription = std::accumulate(std::next(result.transcription.begin()), result.transcription.end(), result.transcription[0],
             [](const std::string& a, const std::string& b) {
             [](const std::string& a, const std::string& b) {
                 return a + " " + b;
                 return a + " " + b;
@@ -167,5 +181,17 @@ int main(int argc, char ** argv) {
         }
         }
     }
     }
 
 
+    // T2TT
+    if (params.input_text.length() > 0) {
+        // tokenize the input text
+        Result result = unity_eval_text(model, params.input_text, params.opts, params.tgt_lang, params.n_threads);
+        std::string concat_translation = std::accumulate(std::next(result.transcription.begin()), result.transcription.end(), result.transcription[0],
+            [](const std::string& a, const std::string& b) {
+                return a + " " + b;
+            }
+        );
+        std::cout << "Translation: " << concat_translation << std::endl;
+    }
+
     return 0;
     return 0;
 }
 }
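With these additions the same binary handles both modes: audio files passed as positional arguments still go through the S2ST loop above, while `-i`/`--input` triggers the T2TT branch, e.g. `./unity -m seamlessM4T_medium.ggml -i "Hello world" -l fra` (the binary name and language code here are illustrative; the `-m`, `-i` and `-l` flags come from this change).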

ggml/ggml_convert.py (+138, -65)

@@ -6,41 +6,51 @@
 
 import dataclasses
 import logging
-import math
 import struct
 from enum import Enum
 from io import BufferedWriter
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set, final
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Sequence, Set, final
+import re
 
 import torch
 from fairseq2.assets import AssetCard
 from fairseq2.models.transformer.frontend import TransformerEmbeddingFrontend
 from fairseq2.nn import SinusoidalPositionEncoder
 from fairseq2.nn.transformer import RelativePositionalEncoding
-from seamless_communication.models import unity
-from fairseq2.data.text import SentencePieceTokenizerBase
-from fairseq2.data.typing import PathLike
-from typing import Sequence
 from fairseq2.data.text import SentencePieceEncoder, SentencePieceTokenizerBase
+from fairseq2.data.typing import PathLike
 from fairseq2.typing import Device, finaloverride
-from fairseq2.models.utils import TokenizerLoaderBase
+from fairseq2.models.utils import TokenizerLoaderBase, ModelLoader
+from fairseq2.models.utils.checkpoint import convert_model_state_dict
 from fairseq2.assets import asset_store, download_manager
-from seamless_communication.models.unity.builder import UnitYConfig, create_unity_model
-from fairseq2.models.utils import ModelLoader
-from seamless_communication.models.unity.model import UnitYModel
 
 import ggml
-import re
 
 Preprocessor = Callable[[Any], Any]
 log = logging.getLogger("ggml_convert")
-SMALLER_MODELS = [
+
+
+class ModelType(str, Enum):
+    AUTO = "auto"  # inferred from the model name
+    UNITY = "unity"
+    NLLB = "nllb"
+
+
+UNITY_SMALLER_MODELS = [
     "unity_nano",
     "unity_micro",
 ]  # Trained with fairseq2, with custom dict (not original NLLB ones)
 
 
+NLLB_2_UNITY_KEYMAP = {
+    r"^encoder_frontend\.": r"text_encoder_frontend.",
+    r"^encoder\."         : r"text_encoder.",
+    r"^decoder\."         : r"text_decoder.",
+    r"^decoder_frontend\.": r"text_decoder_frontend.",
+}
+
+
 @final
 class NllbLikeTokenizer(SentencePieceTokenizerBase):
     """The only difference between this class and NllbTokenizer is it doesn't add a <pad> to control symbol list.
@@ -141,16 +151,6 @@ class NllbLikeTokenizer(SentencePieceTokenizerBase):
         )
 
 
-load_unity_model_without_conversion = ModelLoader[UnitYModel, UnitYConfig](
-    asset_store,
-    download_manager,
-    unity.load_unity_config,
-    create_unity_model,
-    None,
-    restrict_checkpoints=False,
-)
-
-
 @final
 class NllbLikeTokenizerLoader(TokenizerLoaderBase[NllbLikeTokenizer]):
     """Loads tokenizers used by NLLB models."""
@@ -164,44 +164,110 @@ class NllbLikeTokenizerLoader(TokenizerLoaderBase[NllbLikeTokenizer]):
         return NllbLikeTokenizer(pathname, langs, default_lang)
 
 
+def convert_unity_model(
+    model_name: str,
+    hparams: Optional[Dict[str, Any]] = None,
+):
+    from seamless_communication.models import unity
+    from seamless_communication.models.unity.builder import UnitYConfig, create_unity_model
+    from seamless_communication.models.unity.model import UnitYModel
+
+    load_unity_model_without_conversion = ModelLoader[UnitYModel, UnitYConfig](
+        asset_store,
+        download_manager,
+        unity.load_unity_config,
+        create_unity_model,
+        None,
+        restrict_checkpoints=False,
+    )
+
+    model_config = unity.load_unity_config(model_name)
+    hparams = flatten_config(
+        dataclasses.asdict(model_config), separator="__", overrides=hparams
+    )
+    log.info(hparams)
+    # Need the diverge here because current default in SC is to convert from fairseq1 ckpt format
+    if model_name in UNITY_SMALLER_MODELS:
+        model = load_unity_model_without_conversion(model_name)
+        tokenizer = NllbLikeTokenizerLoader(asset_store, download_manager)(model_name)
+    else:
+        model = unity.load_unity_model(model_name)
+        tokenizer = unity.load_unity_text_tokenizer(model_name)
+
+    vocab = read_vocab(tokenizer)
+
+    return model, hparams, vocab
+
+
+def convert_nllb_model(
+    model_name: str,
+    hparams: Optional[Dict[str, Any]] = None,
+):
+    from fairseq2.models.nllb.loader import load_nllb_tokenizer, load_nllb_model, load_nllb_config
+
+    model_config = load_nllb_config(model_name)
+    hparams = flatten_config(
+        dataclasses.asdict(model_config), separator="__", overrides=hparams,
+    )
+
+    model = load_nllb_model(model_name)
+    tokenizer = load_nllb_tokenizer(model_name)
+    vocab = read_vocab(tokenizer)
+
+    return model, hparams, vocab
+
+
 def convert_model(
     model_name: Union[str, torch.nn.Module],
     out: Optional[Path] = None,
+    model_type: ModelType = ModelType.AUTO,
     layers: str = "",
     hparams: Optional[Dict[str, Any]] = None,
     vocab: Optional[List[Tuple[str, float]]] = None,
     fp16: bool = False,
 ) -> None:
+    """
+    Entry function for converting different kinds of model into GGML file. Supported model checkpoints:
+        - unity models
+        - nllb models
+    Args:
+        model_name: name of a registered model (discoverable in a fairseq2 asset), path to a checkpoint,\
+            or the model object passed directly
+        out: path to store the converted .ggml model. If None, the ggml model is stored in the same place\
+            as input model
+        model_type: type of the model (or inferred from the name, only applied to nllb, unity and seamless)
+        layers: wildcard patterns to filter the layers from the model. Does not applied to scripted models
+        hparams: override the hparams in the model with the user-defined values
+        vocab: list of tokens, or aPath to  vocabulary files (in case not bundled with the model checkpoint)
+        fp16: Save to .GGML float16 tensors instead of float32
+    """
+    key_map: Optional[Dict[str, str]] = None
     if isinstance(model_name, str):
         # Load the corresponding fairseq2 model
         if out is None:
             out = Path(model_name).with_suffix(".ggml")
 
-        # The type of model depends on the name
-        if "unity" in model_name or "seamlessM4T" in model_name:
-            if hparams is None:
-                model_config = unity.load_unity_config(model_name)
-                hparams = flatten_config(
-                    dataclasses.asdict(model_config), separator="__"
-                )
-                log.info(hparams)
-            # Need the diverge here because current default in SC is to convert from fairseq1 ckpt format
-            if model_name in SMALLER_MODELS:
-                model = load_unity_model_without_conversion(model_name)
+        # Infer the model architecture from the model name or user input
+        try:
+            if model_type == ModelType.AUTO:
+                if "unity" in model_name or "seamlessM4T" in model_name:
+                    model_type = ModelType.UNITY
+                elif "nllb" in model_name:
+                    model_type = ModelType.NLLB
+
+            assert (
+                model_type != ModelType.AUTO
+            ), "Cannot infer model type from the `model_name`. Please specify `model_type`"
+
+            if model_type == ModelType.UNITY:
+                model, hparams, vocab = convert_unity_model(model_name, hparams=hparams)
+            elif model_type == ModelType.NLLB:
+                model, hparams, vocab = convert_nllb_model(model_name, hparams=hparams)
+                key_map = NLLB_2_UNITY_KEYMAP
             else:
-                model = unity.load_unity_model(model_name)
-            if vocab is None:
-                # Need the diverge here because current default in SC is to add a separate <pad>
-                # as control symbol in NllbTokenizer
-                if model_name in SMALLER_MODELS:
-                    tokenizer = NllbLikeTokenizerLoader(asset_store, download_manager)(
-                        model_name
-                    )
-                else:
-                    tokenizer = unity.load_unity_text_tokenizer(model_name)
-                vocab = read_vocab(tokenizer)
-        else:
-            raise ValueError(f"Unsupported model type: {model_name}")
+                raise ValueError(f"Unsupported model type: {model_name} (type: {model_type})")
+        except Exception as exc:
+            raise ValueError(f"Error in loading model: {model_name}") from exc
     else:
         # Use the model passed explicitly
         assert (
@@ -214,21 +280,14 @@ def convert_model(
     if layers:
         state_dict = {k: v for k, v in state_dict.items() if re.match(layers, k)}
     fixup_model(model, state_dict, layer_filter=layers)
-    layer_config = read_layer_config(model, layer_filter=layers)
+    if key_map:
+        state_dict = convert_model_state_dict(state_dict, key_map=key_map)
+    layer_config = read_layer_config(model, layer_filter=layers, key_map=key_map)
+
     vocab = vocab or []
     write_ggml_file(out, hparams, layer_config, vocab, state_dict, fp16)
 
 
-def _nested_getattr(model: Any, name: str) -> Any:
-    parts = name.split(".")
-    node = model
-    for part in parts:
-        node = getattr(node, part)
-        if node is None:
-            return None
-    return node
-
-
 def find_children(model: torch.nn.Module, t: type, layer_filter: str = "") -> List[Tuple[str, torch.nn.Module]]:
     queue = list(model._modules.items())
     modules = []
@@ -385,10 +444,12 @@ def write_state_dict(
         # Compressed size
         compressed_byte_size = sum(_fp16_byte_size(x) for x in state_dict.values())
         log.warning(
-            f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb compressed to {compressed_byte_size / GB:.3f}"
+            f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb"
+            f". Compressed to {compressed_byte_size / GB:.3f}Gb"
         )
 
     for key, value in state_dict.items():
+        # Rename the layers to make it look like "unity-arch"
         write_string(out, key)
         if key.endswith(".bias") and value.ndim == 1 and "adaptor" not in key:
             # GGML broadcasting isn't as strong as numpy
@@ -463,7 +524,7 @@ def torch_to_ggml_type(dtype: torch.dtype) -> int:
 def flatten_config(
     config: Dict[str, Any],
     separator: str,
-    config_preprocessor: Optional[Preprocessor] = None,
+    overrides: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
     """Flatten nested dictionnary
 
@@ -478,9 +539,6 @@
         flat dictionnary
     """
 
-    if config_preprocessor is None:
-        config_preprocessor = lambda x: x
-
    def __flatten(config: Dict[str, Any], prefix: str = "") -> Dict[str, Any]:
         result = {}
         for key in config:
@@ -489,16 +547,22 @@
                 nested_result = __flatten(config[key], f"{new_key}{separator}")
                 result.update(nested_result)
             else:
-                new_config = config_preprocessor(config[key])
+                new_config = config[key]
                 if new_config is not None:
                     result[new_key] = config[key]
 
         return result
 
-    return __flatten(config)
+    res_config = __flatten(config)
+    if overrides:
+        return {**res_config, **overrides}
+    else:
+        return res_config
 
 
-def read_layer_config(model: torch.nn.Module, layer_filter: str) -> Dict[str, Any]:
+def read_layer_config(
+    model: torch.nn.Module, layer_filter: str, key_map: Optional[Dict[str, str]] = None
+) -> Dict[str, Any]:
     layer_config = {}
 
     def _append_node_config(node: Any, prefix: str) -> None:
@@ -523,6 +587,15 @@ def read_layer_config(model: torch.nn.Module, layer_filter: str) -> Dict[str, An
     _append_node_config(model, "")
     for name, node in find_children(model, torch.nn.Module, layer_filter):
         _append_node_config(node, name + ".")
+
+    key_map = key_map or {}
+    keys_to_replace = []
+    for k, v in layer_config.items():
+        for old_pattern, replacement in key_map.items():
+            if (new_key := re.sub(old_pattern, replacement, k)) != k:
+                keys_to_replace.append((k, new_key))
+    for old_key, new_key in keys_to_replace:
+        layer_config[new_key] = layer_config.pop(old_key)
     return layer_config
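Taken together, the converter now routes NLLB checkpoints through `convert_nllb_model` and renames both their weights (via `convert_model_state_dict`) and their layer config (via `read_layer_config`) to the unity-arch layout on write. A minimal usage sketch; the asset name, output path, and import path below are assumptions for illustration, not taken from this commit:

```python
# Driving the converter for an NLLB checkpoint registered as a fairseq2 asset.
from pathlib import Path

from ggml_convert import ModelType, convert_model  # assumes ggml/ is on PYTHONPATH

convert_model(
    "nllb-200_dense_distill_600m",               # illustrative fairseq2 asset name
    out=Path("nllb-200_dense_distill_600m.ggml"),
    model_type=ModelType.NLLB,                   # or ModelType.AUTO to infer from the name
    fp16=True,                                   # write float16 tensors to shrink the file
)
```

With `model_type=ModelType.AUTO`, any name containing "nllb" takes the NLLB path and any name containing "unity" or "seamlessM4T" takes the unity path, per the dispatch in `convert_model` above.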

ggml/include/ggml/ggml.h (+2, -2)

@@ -215,13 +215,13 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         2048
+#define GGML_MAX_PARAMS         4096
 #define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
 #define GGML_MAX_NAME           64
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
-#define GGML_DEFAULT_GRAPH_SIZE 2048
+#define GGML_DEFAULT_GRAPH_SIZE 4096
 #if UINTPTR_MAX == 0xFFFFFFFF
     #define GGML_MEM_ALIGN 4
 #else
 #else