
force little-endian

Guillaume Wenzek, 1 year ago
Commit eb7810b81f

+ 0 - 1
ggml/examples/unity/CMakeLists.txt

@@ -7,5 +7,4 @@ target_sources(unity
     PRIVATE
         fairseq2.cpp
         model_loader.cpp
-        unity_model_loader.cpp
 )

+ 8 - 6
ggml/examples/unity/fairseq2.cpp

@@ -12,25 +12,27 @@
 extern "C" fairseq2_model* fairseq2_model_alloc() {
     // pre-allocate some memory to write hyperparameters and tensors pointers
     auto* model = new fairseq2_model;
-    model->hparams = new std::uint8_t[8 * 1024];
     model->tensors_ctx = nullptr;
     return model;
 }
 
 
-double fairseq2_model_layer_config_double(const fairseq2_model& model, std::string name) {
+inline double model_layer_config_d(const fairseq2_model& model, std::string name) {
     const std::int64_t* data = &model.layer_config.at(name);
     return *(double*)data;
 }
 
-std::int64_t fairseq2_model_layer_config_int(const fairseq2_model& model, std::string name) {
-    return model.layer_config.at(name);
+extern "C" double fairseq2_model_layer_config_double(const fairseq2_model& model, const char* name) {
+    return model_layer_config_d(model, std::string(name));
+}
+
+extern "C" std::int64_t fairseq2_model_layer_config_int(const fairseq2_model& model, const char* name) {
+    return model.layer_config.at(std::string(name));
 }
 
 
 extern "C" void fairseq2_model_free(fairseq2_model* model) {
     if (model->tensors_ctx) ggml_free(model->tensors_ctx);
-    delete (std::uint8_t*)model->hparams;
     delete model;
 }
 
@@ -77,7 +79,7 @@ extern "C" ggml_tensor* LayerNorm_forward(
     GGML_ASSERT(bias != nullptr);
 
     auto ctx = model.ctx;
-    double eps = fairseq2_model_layer_config_double(model, prefix + ".eps");
+    double eps = model_layer_config_d(model, prefix + ".eps");
 
     input = ggml_norm(ctx, input, /*eps*/eps);
     return ggml_add_inplace(
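
The layer_config map stores every value in an int64 slot, and model_layer_config_d recovers a double by reinterpreting those 8 bytes; this only works because the converter writes the raw little-endian bit pattern. A minimal Python sketch of that round trip (plain illustration, not part of the diff; the eps value is made up):

    import struct

    eps = 1e-5
    raw = struct.pack("<d", eps)               # the 8 bytes written into the .ggml file
    as_int64 = struct.unpack("<q", raw)[0]     # what load_hparams stores in layer_config

    # What model_layer_config_d does on the C++ side: reinterpret the int64 bits as a double.
    recovered = struct.unpack("<d", struct.pack("<q", as_int64))[0]
    assert recovered == eps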

+ 12 - 4
ggml/examples/unity/fairseq2.h

@@ -1,6 +1,6 @@
 #pragma once
 
-#include <map>
+#include <unordered_map>
 #include <string>
 #include <vector>
 #include "ggml.h"
@@ -10,10 +10,17 @@
 struct fairseq2_model {
     // Context containing all tensors memory
     ggml_context* tensors_ctx;
+
     // Named tensors, all tensors should belong to tensors_ctx
-    std::map<std::string, struct ggml_tensor *> tensors;
-    std::map<std::string, std::int64_t> layer_config;
-    void* hparams;
+    std::unordered_map<std::string, struct ggml_tensor *> tensors;
+
+    // Hashmap containing model hyper-parameters.
+    std::unordered_map<std::string, std::int64_t> hparams;
+
+    // Hashmap containing layers hyper-parameters.
+    // Normally these could be inferred from hparams, but storing them here avoids reimplementing that logic in GGML
+    std::unordered_map<std::string, std::int64_t> layer_config;
+
     // an inference context, not managed by this object
     // TODO: is this the best place to store this or should we also pass this to all forward methods ?
     ggml_context* ctx;
@@ -147,6 +154,7 @@ extern "C" ggml_tensor* StandardConformerEncoderAdaptor_forward(
     ggml_tensor* padding_mask
 );
 // Specifies the Layer Normalization order.
+// see fairseq2/nn/transformer/norm_order.py
 enum TransformerNormOrder {
     TRANSFORMER_NORM_ORDER_POST = 0,
     TRANSFORMER_NORM_ORDER_PRE = 1,
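
To illustrate the split between the two new maps (keys below are drawn from elsewhere in this commit, values are made up): hparams holds the flattened model config, while layer_config holds per-module attributes that forward functions look up as prefix + ".<attr>".

    # hparams: flattened model config (compare the removed unity_hparams struct)
    hparams = {"model_dim": 1024, "w2v2_encoder_config__num_encoder_layers": 12}

    # layer_config: per-module attributes, e.g. ".eps" read by LayerNorm_forward,
    # or the Linear attributes checked in test_convert_linear
    layer_config = {"input_dim": 16, "output_dim": 24, "skip_init": 0}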

+ 66 - 16
ggml/examples/unity/model_loader.cpp

@@ -35,12 +35,22 @@ void register_prefix(fairseq2_model &model, const std::string& name) {
 }
 
 
-int
+std::int64_t
 model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
 {
-    int num_tensor = 0;
+    std::int64_t num_tensor = 0;
+    std::int64_t ctx_size = 0;
     fin.read((char*) &num_tensor, sizeof(num_tensor));
-    size_t total_size = 0;
+    fin.read((char*) &ctx_size, sizeof(ctx_size));
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ static_cast<std::size_t>(ctx_size),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+    model.tensors_ctx = ggml_init(params);
+
+    size_t model_size = 0;
     for (int i = 0; i < num_tensor; ++i) {
         std::string name = get_name(fin);
         if (name.length() == 0)
@@ -49,7 +59,7 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
         if (tensor == nullptr) {
             // Abort in case of error, the input stream is corrupted at this point.
             printf("Error while reading tensor %s\n", name.c_str() );
-            return 1;
+            throw std::invalid_argument("Error while reading tensor from file.");
         }
         register_prefix(model, name);
         ggml_set_name(tensor, name.c_str());
@@ -57,27 +67,57 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
         if (DEBUG_MODEL_LOAD) {
             printf("%s [%5ld, %5ld], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), tensor->ne[0], tensor->ne[1], ggml_type_name(tensor->type), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
         }
-        total_size += ggml_nbytes(tensor);
+        model_size += ggml_nbytes(tensor);
     }
 
-    printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    return 0;
+    double mb = 1024.0 * 1024.0;
+    printf("%s: model size  = %8.2f MB, memory used = %8.2f MB, memory reserved = %8.2f \n",
+        __func__,
+        model_size / mb,
+        ggml_used_mem(model.tensors_ctx) / mb,
+        ctx_size / mb
+    );
+
+    return ctx_size;
+}
+
+void assert_endianness() {
+    union {
+        unsigned int i;
+        char c[4];
+    } un;
+    un.i = 0x12345678;
+
+    if (un.c[0] == 0x78 && un.c[3] == 0x12) {
+        printf("little-endian\n");
+    }
+    else if (un.c[0] == 0x12 && un.c[3] == 0x78) {
+        printf("big-endian\n");
+        GGML_ASSERT(false); // model_loader.cpp assumes the system is little-endian
+    }
+    else {
+        printf("unknown-endian\n");
+        GGML_ASSERT(false); // model_loader.cpp assumes the system is little-endian
+    }
 }
 
 
-int
-model_loader::load_layer_config(fairseq2_model &model, std::ifstream &fin)
+void model_loader::load_hparams(std::unordered_map<std::string, std::int64_t>& hparams, std::ifstream &fin)
 {
+    std::int64_t num_params = 0;
+    fin.read(reinterpret_cast<char*>(&num_params), sizeof num_params);
+    GGML_ASSERT(fin.gcount() == 8);
+
+    hparams.reserve(num_params);
+
     std::int64_t value;
-    while (!fin.eof()) {
+    for (int i = 0; i < num_params; ++i) {
         std::string name = get_name(fin);
         if (name.length() == 0)
             break;
         fin.read((char*) &value, sizeof(value));
-        model.layer_config[name] = value;
+        hparams[name] = value;
     }
-
-    return 0;
 }
 
 ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx)
@@ -107,11 +147,21 @@ model_loader::get_name(std::ifstream& fin)
 {
     std::uint32_t length = 0;
     fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+    if (length == 0)
+        return "";
+
     std::string name(length, 0);
-    if (length == 0) {
-        return name;
-    };
     fin.read(&name[0], length);
 
     return name;
 }
+
+extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
+    model_loader loader;
+    assert_endianness();
+    auto fin = open_ggml_file(fname);
+    loader.load_hparams(model.hparams, fin);
+    loader.load_hparams(model.layer_config, fin);
+    loader.load_model_weights(model, fin);
+    return 0;
+}
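
With this change, load_fairseq2_ggml_file expects the sections in a fixed order: the 4-byte magic, the model hparams, the layer_config, then the weights, whose header now carries both the tensor count and the pre-computed context size. A hedged Python sketch that parses just those headers (the file name is only an example; tensor payloads are not read):

    import struct

    def read_hparams_section(fin):
        # Mirrors model_loader::load_hparams: int64 count, then (name, 8-byte value) pairs.
        (count,) = struct.unpack("<q", fin.read(8))
        out = {}
        for _ in range(count):
            (name_len,) = struct.unpack("<i", fin.read(4))
            name = fin.read(name_len).decode("utf-8")
            out[name] = fin.read(8)  # raw 8 bytes: an int64 or the bit pattern of a double
        return out

    with open("seamlessM4T_medium.ggml", "rb") as fin:
        magic = fin.read(4)                         # written reversed by write_ggml_header
        hparams = read_hparams_section(fin)         # model hyper-parameters
        layer_config = read_hparams_section(fin)    # per-layer config
        num_tensors, ctx_size = struct.unpack("<qq", fin.read(16))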

+ 3 - 28
ggml/examples/unity/model_loader.h

@@ -19,15 +19,9 @@
 
 class model_loader {
 public:
-    virtual ~model_loader() {};
+    std::int64_t load_model_weights(fairseq2_model &model, std::ifstream &fin);
 
-    virtual void load_hparams(fairseq2_model& model, std::ifstream &fin) = 0;
-
-    virtual std::size_t compute_context_size(void *raw_hparams) = 0;
-
-    int load_model_weights(fairseq2_model &model, std::ifstream &fin);
-
-    int load_layer_config(fairseq2_model &model, std::ifstream &fin);
+    void load_hparams(std::unordered_map<std::string, std::int64_t>& hparams, std::ifstream &fin);
 
 private:
     ggml_tensor * next_tensor(std::ifstream &fin, fairseq2_model &model);
@@ -39,23 +33,4 @@ ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx);
 
 std::ifstream open_ggml_file(const char* fname);
 
-template<typename T>
-int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
-    T loader;
-    auto fin = open_ggml_file(fname);
-    loader.load_hparams(model, fin);
-
-    std::size_t ctx_size = loader.compute_context_size(model.hparams);
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ false,
-    };
-    model.tensors_ctx = ggml_init(params);
-
-    int err = loader.load_model_weights(model, fin);
-    if (err) return err;
-
-    return loader.load_layer_config(model, fin);
-}
-
+extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname);

+ 1 - 1
ggml/examples/unity/unity.cpp

@@ -10,7 +10,7 @@
 #include <cstdio>
 #include <cstring>
 #include <fstream>
-#include <map>
+#include <unordered_map>
 #include <string>
 #include <vector>
 #include <iostream>

+ 0 - 36
ggml/examples/unity/unity_model_loader.cpp

@@ -1,36 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-// All rights reserved.
-//
-// This source code is licensed under the license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include "unity_model_loader.h"
-
-void unity_model_loader::load_hparams(fairseq2_model& model, std::ifstream &fin)
-{
-    unity_hparams* hparams = (unity_hparams*)model.hparams;
-    read_unity_hparams(*hparams, fin);
-    if (hparams->__end_of_hparams__ != 6877961321223123048) {
-        throw std::invalid_argument("");
-    }
-}
-
-std::size_t
-unity_model_loader::compute_context_size(void* raw_hparams)
-{
-    auto* hparams = (unity_hparams*)raw_hparams;
-    return hparams->model_byte_size;
-};
-
-extern "C" int load_unity_ggml_file(fairseq2_model& model, const char* fname) {
-    return load_fairseq2_ggml_file<unity_model_loader>(model, fname);
-}
-
-
-

+ 0 - 134
ggml/examples/unity/unity_model_loader.h

@@ -1,134 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-// All rights reserved.
-//
-// This source code is licensed under the license found in the
-// LICENSE file in the root directory of this source tree.
-
-#pragma once
-
-#include <vector>
-#include "model_loader.h"
-
-
-struct unity_hparams {
-    std::int64_t model_dim;
-    std::int64_t w2v2_encoder_config__model_dim;
-    std::int64_t w2v2_encoder_config__max_seq_len;
-    std::int64_t w2v2_encoder_config__feature_dim;
-    std::int64_t w2v2_encoder_config__use_fbank;
-    double w2v2_encoder_config__first_pass_dropout_p;
-    std::int64_t w2v2_encoder_config__layer_norm_features;
-    // Error: Unsupported type <class 'list'> w2v2_encoder_config__feature_extractor_layer_descs;
-    std::int64_t w2v2_encoder_config__feature_extractor_bias;
-    std::int64_t w2v2_encoder_config__feature_extractor_layer_norm_convs;
-    std::int64_t w2v2_encoder_config__feature_grad_scale;
-    std::int64_t w2v2_encoder_config__num_fbank_channels;
-    std::int64_t w2v2_encoder_config__fbank_stride;
-    std::int64_t w2v2_encoder_config__sample_fbank_every_k;
-    // Error: Unsupported type <class 'str'> w2v2_encoder_config__pos_encoder_type;
-    std::int64_t w2v2_encoder_config__pos_encoder_depth;
-    std::int64_t w2v2_encoder_config__pos_conv_kernel_size;
-    std::int64_t w2v2_encoder_config__num_pos_conv_groups;
-    std::int64_t w2v2_encoder_config__use_conformer;
-    std::int64_t w2v2_encoder_config__num_encoder_layers;
-    std::int64_t w2v2_encoder_config__num_encoder_attn_heads;
-    std::int64_t w2v2_encoder_config__ffn_inner_dim;
-    double w2v2_encoder_config__dropout_p;
-    double w2v2_encoder_config__attn_dropout_p;
-    double w2v2_encoder_config__layer_drop_p;
-    std::int64_t w2v2_encoder_config__norm_order;
-    std::int64_t w2v2_encoder_config__depthwise_conv_kernel_size;
-    std::int64_t mt_model_config__model_dim;
-    std::int64_t mt_model_config__max_seq_len;
-    std::int64_t mt_model_config__vocabulary_size;
-    std::int64_t mt_model_config__pad_idx;
-    std::int64_t mt_model_config__num_encoder_layers;
-    std::int64_t mt_model_config__num_decoder_layers;
-    std::int64_t mt_model_config__num_encoder_attn_heads;
-    std::int64_t mt_model_config__num_decoder_attn_heads;
-    std::int64_t mt_model_config__ffn_inner_dim;
-    double mt_model_config__dropout_p;
-    std::int64_t t2u_config__model_dim;
-    std::int64_t t2u_config__unit_max_seq_len;
-    std::int64_t t2u_config__unit_vocabulary_size;
-    std::int64_t t2u_config__unit_pad_idx;
-    std::int64_t t2u_config__num_encoder_layers;
-    std::int64_t t2u_config__num_decoder_layers;
-    std::int64_t t2u_config__num_encoder_attn_heads;
-    std::int64_t t2u_config__num_decoder_attn_heads;
-    std::int64_t t2u_config__ffn_inner_dim;
-    double t2u_config__dropout_p;
-    std::int64_t use_text_encoder;
-    std::int64_t use_conformer_adaptor;
-    std::int64_t num_adaptor_layers;
-    std::int64_t adaptor_kernel_size;
-    std::int64_t adaptor_stride;
-    std::int64_t adaptor_layer_norm;
-    double adaptor_dropout_p;
-    std::int64_t model_byte_size;
-    std::int64_t __end_of_hparams__;
-};
-
-void read_unity_hparams(unity_hparams& out, std::ifstream &fin) {
-    fin.read((char*) &out.model_dim, sizeof(out.model_dim));
-    fin.read((char*) &out.w2v2_encoder_config__model_dim, sizeof(out.w2v2_encoder_config__model_dim));
-    fin.read((char*) &out.w2v2_encoder_config__max_seq_len, sizeof(out.w2v2_encoder_config__max_seq_len));
-    fin.read((char*) &out.w2v2_encoder_config__feature_dim, sizeof(out.w2v2_encoder_config__feature_dim));
-    fin.read((char*) &out.w2v2_encoder_config__use_fbank, sizeof(out.w2v2_encoder_config__use_fbank));
-    fin.read((char*) &out.w2v2_encoder_config__first_pass_dropout_p, sizeof(out.w2v2_encoder_config__first_pass_dropout_p));
-    fin.read((char*) &out.w2v2_encoder_config__layer_norm_features, sizeof(out.w2v2_encoder_config__layer_norm_features));
-    fin.read((char*) &out.w2v2_encoder_config__feature_extractor_bias, sizeof(out.w2v2_encoder_config__feature_extractor_bias));
-    fin.read((char*) &out.w2v2_encoder_config__feature_extractor_layer_norm_convs, sizeof(out.w2v2_encoder_config__feature_extractor_layer_norm_convs));
-    fin.read((char*) &out.w2v2_encoder_config__feature_grad_scale, sizeof(out.w2v2_encoder_config__feature_grad_scale));
-    fin.read((char*) &out.w2v2_encoder_config__num_fbank_channels, sizeof(out.w2v2_encoder_config__num_fbank_channels));
-    fin.read((char*) &out.w2v2_encoder_config__fbank_stride, sizeof(out.w2v2_encoder_config__fbank_stride));
-    fin.read((char*) &out.w2v2_encoder_config__sample_fbank_every_k, sizeof(out.w2v2_encoder_config__sample_fbank_every_k));
-    fin.read((char*) &out.w2v2_encoder_config__pos_encoder_depth, sizeof(out.w2v2_encoder_config__pos_encoder_depth));
-    fin.read((char*) &out.w2v2_encoder_config__pos_conv_kernel_size, sizeof(out.w2v2_encoder_config__pos_conv_kernel_size));
-    fin.read((char*) &out.w2v2_encoder_config__num_pos_conv_groups, sizeof(out.w2v2_encoder_config__num_pos_conv_groups));
-    fin.read((char*) &out.w2v2_encoder_config__use_conformer, sizeof(out.w2v2_encoder_config__use_conformer));
-    fin.read((char*) &out.w2v2_encoder_config__num_encoder_layers, sizeof(out.w2v2_encoder_config__num_encoder_layers));
-    fin.read((char*) &out.w2v2_encoder_config__num_encoder_attn_heads, sizeof(out.w2v2_encoder_config__num_encoder_attn_heads));
-    fin.read((char*) &out.w2v2_encoder_config__ffn_inner_dim, sizeof(out.w2v2_encoder_config__ffn_inner_dim));
-    fin.read((char*) &out.w2v2_encoder_config__dropout_p, sizeof(out.w2v2_encoder_config__dropout_p));
-    fin.read((char*) &out.w2v2_encoder_config__attn_dropout_p, sizeof(out.w2v2_encoder_config__attn_dropout_p));
-    fin.read((char*) &out.w2v2_encoder_config__layer_drop_p, sizeof(out.w2v2_encoder_config__layer_drop_p));
-    fin.read((char*) &out.w2v2_encoder_config__norm_order, sizeof(out.w2v2_encoder_config__norm_order));
-    fin.read((char*) &out.w2v2_encoder_config__depthwise_conv_kernel_size, sizeof(out.w2v2_encoder_config__depthwise_conv_kernel_size));
-    fin.read((char*) &out.mt_model_config__model_dim, sizeof(out.mt_model_config__model_dim));
-    fin.read((char*) &out.mt_model_config__max_seq_len, sizeof(out.mt_model_config__max_seq_len));
-    fin.read((char*) &out.mt_model_config__vocabulary_size, sizeof(out.mt_model_config__vocabulary_size));
-    fin.read((char*) &out.mt_model_config__pad_idx, sizeof(out.mt_model_config__pad_idx));
-    fin.read((char*) &out.mt_model_config__num_encoder_layers, sizeof(out.mt_model_config__num_encoder_layers));
-    fin.read((char*) &out.mt_model_config__num_decoder_layers, sizeof(out.mt_model_config__num_decoder_layers));
-    fin.read((char*) &out.mt_model_config__num_encoder_attn_heads, sizeof(out.mt_model_config__num_encoder_attn_heads));
-    fin.read((char*) &out.mt_model_config__num_decoder_attn_heads, sizeof(out.mt_model_config__num_decoder_attn_heads));
-    fin.read((char*) &out.mt_model_config__ffn_inner_dim, sizeof(out.mt_model_config__ffn_inner_dim));
-    fin.read((char*) &out.mt_model_config__dropout_p, sizeof(out.mt_model_config__dropout_p));
-    fin.read((char*) &out.t2u_config__model_dim, sizeof(out.t2u_config__model_dim));
-    fin.read((char*) &out.t2u_config__unit_max_seq_len, sizeof(out.t2u_config__unit_max_seq_len));
-    fin.read((char*) &out.t2u_config__unit_vocabulary_size, sizeof(out.t2u_config__unit_vocabulary_size));
-    fin.read((char*) &out.t2u_config__unit_pad_idx, sizeof(out.t2u_config__unit_pad_idx));
-    fin.read((char*) &out.t2u_config__num_encoder_layers, sizeof(out.t2u_config__num_encoder_layers));
-    fin.read((char*) &out.t2u_config__num_decoder_layers, sizeof(out.t2u_config__num_decoder_layers));
-    fin.read((char*) &out.t2u_config__num_encoder_attn_heads, sizeof(out.t2u_config__num_encoder_attn_heads));
-    fin.read((char*) &out.t2u_config__num_decoder_attn_heads, sizeof(out.t2u_config__num_decoder_attn_heads));
-    fin.read((char*) &out.t2u_config__ffn_inner_dim, sizeof(out.t2u_config__ffn_inner_dim));
-    fin.read((char*) &out.t2u_config__dropout_p, sizeof(out.t2u_config__dropout_p));
-    fin.read((char*) &out.use_text_encoder, sizeof(out.use_text_encoder));
-    fin.read((char*) &out.use_conformer_adaptor, sizeof(out.use_conformer_adaptor));
-    fin.read((char*) &out.num_adaptor_layers, sizeof(out.num_adaptor_layers));
-    fin.read((char*) &out.adaptor_kernel_size, sizeof(out.adaptor_kernel_size));
-    fin.read((char*) &out.adaptor_stride, sizeof(out.adaptor_stride));
-    fin.read((char*) &out.adaptor_layer_norm, sizeof(out.adaptor_layer_norm));
-    fin.read((char*) &out.adaptor_dropout_p, sizeof(out.adaptor_dropout_p));
-    fin.read((char*) &out.model_byte_size, sizeof(out.model_byte_size));
-    fin.read((char*) &out.__end_of_hparams__, sizeof(out.__end_of_hparams__));
-};
-
-class unity_model_loader: public model_loader {
-    public:
-    void load_hparams(fairseq2_model& model, std::ifstream &fin);
-
-    std::size_t compute_context_size(void* raw_hparams);
-};

+ 10 - 4
ggml/ggml.py

@@ -307,14 +307,14 @@ def CppStr(content: str) -> NativeObj:
     return NativeObj("std_string", cpp_str)
 
 
-lib.load_unity_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-lib.load_unity_ggml_file.restype = ctypes.c_int
+lib.load_fairseq2_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
+lib.load_fairseq2_ggml_file.restype = ctypes.c_int
 
 
-def load_unity_ggml_file(model_file: Path) -> NativeObj:
+def load_fairseq2_ggml_file(model_file: Path) -> NativeObj:
     model = Fairseq2Model()
     bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
-    err = lib.load_unity_ggml_file(model.ptr, bytes_file)
+    err = lib.load_fairseq2_ggml_file(model.ptr, bytes_file)
     if err:
         raise Exception("Failed to load model")
     return model
@@ -446,6 +446,12 @@ def generate_sequence(
 ) -> Ptr[Hypothesis]:
     ...
 
+
 @c_fn(lib)
 def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
     return Ptr()
+
+
+@c_fn(lib)
+def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
+    pass
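
With the loader renamed and the accessor exposed, a converted file can be loaded and queried from Python without the old unity-specific entry point. A usage sketch (the file name matches the test fixture below; the layer_config key is hypothetical):

    from pathlib import Path
    import ggml

    model = ggml.load_fairseq2_ggml_file(Path("seamlessM4T_medium.ggml"))
    # layer_config values are stored as int64; keys are passed as ASCII bytes,
    # as in test_convert_linear below.
    dim = ggml.fairseq2_model_layer_config_int(model.ptr, b"text_decoder.layers.0.ffn.inner_dim")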

+ 102 - 71
ggml/ggml_convert.py

@@ -68,28 +68,37 @@ def pos_enc(max_seq_len=4096, encoding_dim=1024):
 
     return weight
 
-def convert_model(model_name: str, out: Optional[Path] = None) -> None:
-    if out is None:
-        out = Path(model_name).with_suffix(".ggml")
-
-    # The type of model depends on the name
-    if "unity" in model_name or "seamlessM4T" in model_name:
-        model_config = load_unity_config(model_name)
-        hparams = flatten_config(dataclasses.asdict(model_config), separator="__")
-        print(hparams)
-        model = load_unity_model(model_name)
+def convert_model(
+    model_name: Union[str, torch.nn.Module],
+    out: Optional[Path] = None,
+    hparams: Optional[Dict[str, Any]] = None,
+) -> None:
+    if isinstance(model_name, str):
+        # Load the corresponding fairseq2 model
+        if out is None:
+            out = Path(model_name).with_suffix(".ggml")
+
+        # The type of model depends on the name
+        if "unity" in model_name or "seamlessM4T" in model_name:
+            if hparams is None:
+                model_config = load_unity_config(model_name)
+                hparams = flatten_config(dataclasses.asdict(model_config), separator="__")
+                print(hparams)
+            model = load_unity_model(model_name)
+        else:
+            raise ValueError(f"Unsupported model type: {model_name}")
     else:
-        raise ValueError(f"Unsupported model type: {model_name}")
+        # Use the model passed explicitly
+        assert out is not None, "output path is required when explicitly passing a module"
+        hparams = hparams or {}
+        model = model_name
 
     state_dict = model.state_dict()
     fixup_model(model, state_dict)
+    layer_config = read_layer_config(model)
 
     with out.open("wb") as o:
-        write_ggml_file(o, hparams, state_dict)
-        write_layer_config(o, model)
-
-    with out.with_suffix(".hparams.h").open("w") as h:
-        h.write(generate_hparams_struct(hparams, "unity_hparams"))
+        write_ggml_file(o, hparams, layer_config, state_dict)
 
 
 def _nested_getattr(model: Any, name: str) -> Any:
@@ -120,7 +129,10 @@ def find_children(model: torch.nn.Module, t: type) -> List[Tuple[str, torch.nn.M
 def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) -> None:
     # Bake the embedding scaling into the weights
     frontends = find_children(model, TransformerEmbeddingFrontend)
-    print("Upgrading the following TransformerEmbeddingFrontend:", [x[0] for x in frontends])
+    print(
+        "Upgrading the following TransformerEmbeddingFrontend:",
+        [x[0] for x in frontends],
+    )
     for name, frontend in frontends:
         embed_weights = state_dict[name + ".embed.weight"]
         state_dict[name + ".embed.weight"] = embed_weights * frontend.scale
@@ -128,7 +140,10 @@ def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) ->
     # Sinusoidal embeddings are typically not saved since they are easily recomputed,
     # but this allows to avoid porting the sinusoidal logic to GGML
     pos_encoders = find_children(model, SinusoidalPositionEncoder)
-    print("Upgrading the following SinusoidalPositionEncoder:", [x[0] for x in pos_encoders])
+    print(
+        "Upgrading the following SinusoidalPositionEncoder:",
+        [x[0] for x in pos_encoders],
+    )
     for name, pos_encoder in pos_encoders:
         assert isinstance(pos_encoder.weight, torch.Tensor)
         assert name not in state_dict
@@ -137,29 +152,19 @@ def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) ->
     state_dict["speech_encoder.pos_enc"] = pos_enc()
 
 def write_ggml_file(
-    out: BufferedWriter, hparams: Dict[str, Any], state_dict: Dict[str, torch.Tensor]
+    out: BufferedWriter,
+    hparams: Dict[str, Any],
+    layer_config: Dict[str, Any],
+    state_dict: Dict[str, torch.Tensor],
 ) -> None:
     write_ggml_header(out)
-
-    # Apppend the byte size to the hparams.
-    if "model_byte_size" not in hparams:
-        # Size of each tensor
-        byte_size = sum(x.numel() * x.element_size() for x in state_dict.values())
-        # + tensor overhead
-        byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
-        hparams["model_byte_size"] = byte_size
-        logging.warning(
-            f"Saving a ggml file with {len(state_dict)} tensors, for an estimated amount of {byte_size / (1024**3)} GGML Gb"
-        )
-    # 6877961321223123048
-    hparams["__end_of_hparams__"] = struct.unpack("l", b"hparams_")[0]
-
     write_hparams(out, hparams)
+    write_hparams(out, layer_config)
     write_state_dict(out, state_dict)
 
 
 def write_ggml_header(out: BufferedWriter) -> None:
-    """Write GGML header (in reverse cause why not)"""
+    """Write GGML header (in reverse cause big-endian)"""
     out.write(b"ggml"[::-1])
 
 
@@ -170,17 +175,23 @@ def write_hparams(out: BufferedWriter, hparams: Dict[str, Any]) -> None:
         flattened dict containing model's hyper parameters.
 
     """
-    # TODO: should we preprend the size of the hparams struct ?
-    # this would help catch out of sync writer/loader code
+    simple_vals = {}
     for key, value in hparams.items():
         try:
-            # TODO: this is not cross platform, what's the standard way of writing hparams in GGML ?
-            ctype, cvalue = to_ctype(value)
-            out.write(struct.pack(ctype, cvalue))
+            simple_vals[key] = to_ctype(value)
         except ValueError:
             logging.warning(f"Skipping config for key {key}={value!r}")
             continue
 
+    out.write(struct.pack("<q", len(simple_vals)))
+    for key, (ctype, cvalue) in simple_vals.items():
+        write_string(out, key)
+        b = struct.pack(ctype, cvalue)
+        assert len(b) == 8
+        out.write(b)
+
+    logging.info(f"Saved {len(simple_vals)} params.")
+
 
 def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -> None:
     """Write pytorch state dict.
@@ -188,7 +199,15 @@ def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -
     :paras state_dict:
         state dict returned by pytorch model
     """
-    out.write(struct.pack("i", len(state_dict)))
+    out.write(struct.pack("<q", len(state_dict)))
+    # Size of each tensor
+    byte_size = sum(x.numel() * x.element_size() for x in state_dict.values())
+    # + tensor overhead
+    byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
+    out.write(struct.pack("<q", byte_size))
+    logging.warning(
+        f"Saving a ggml file with {len(state_dict)} tensors, for an estimated amount of {byte_size / (1024**3):.3f} GGML Gb"
+    )
     for key, value in state_dict.items():
         write_string(out, key)
         if key.endswith(".bias") and value.ndim == 1 and "adaptor" not in key:
@@ -201,27 +220,6 @@ def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -
         write_tensor(out, value.contiguous())
 
 
-def write_layer_config(out: BufferedWriter, model: torch.nn.Module) -> None:
-    for name, node in find_children(model, torch.nn.Module):
-        for k, v in node.__dict__.items():
-            # Skip special members. In particular all children module and tensors
-            # will be hidden in special dicts `_parameters` and `_modules`
-            if k.startswith("_"):
-                continue
-            # All modules have a "training" flag
-            if k == "training":
-                continue
-            if v is None:
-                continue
-            try:
-                ctype, cvalue = to_ctype(v)
-                write_string(out, f"{name}.{k}")
-                out.write(struct.pack(ctype, cvalue))
-            except ValueError as e:
-                logging.warning(f"Skipping config for {name}.{k}={v!r}")
-                continue
-
-
 def write_string(out: BufferedWriter, value: str) -> None:
     """Write string in utf-8 format.
 
@@ -229,7 +227,9 @@ def write_string(out: BufferedWriter, value: str) -> None:
         string value to dump.
     """
     str_ = value.encode("utf-8")
-    out.write(struct.pack("i", len(str_)))
+    packed_len = struct.pack("<i", len(str_))
+    assert len(packed_len) == 4
+    out.write(packed_len)
     out.write(str_)
 
 
@@ -243,7 +243,7 @@ def write_tensor(out: BufferedWriter, value: torch.Tensor) -> None:
         Tensor to dump.
     """
     if value.dtype is torch.int64:
-        # GGML doesn't ahve int64, downcast it
+        # GGML doesn't have int64, downcast it
         value = value.to(dtype=torch.int32)
 
     if value.ndim == 0:
@@ -256,11 +256,11 @@ def write_tensor(out: BufferedWriter, value: torch.Tensor) -> None:
     assert n_dims >= 1, "ggml doesn't support 0 dim tensors"
 
     ftype = torch_to_ggml_type(value.dtype)
-    out.write(struct.pack("i", n_dims))
-    out.write(struct.pack("i", ftype))
+    out.write(struct.pack("<i", n_dims))
+    out.write(struct.pack("<i", ftype))
     for i in range(n_dims):
         # ggml uses long for shape
-        out.write(struct.pack("l", data.shape[n_dims - 1 - i]))
+        out.write(struct.pack("<q", data.shape[n_dims - 1 - i]))
 
     data.tofile(out)
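
After the count and byte-size header above, each tensor record is laid out as name, dims, dtype and raw data. A sketch of reading one record back (assumes an f32 tensor; the ftype-to-dtype mapping and the bias reshaping are deliberately ignored here):

    import struct
    import numpy as np

    def read_tensor_record(fin):
        (name_len,) = struct.unpack("<i", fin.read(4))
        name = fin.read(name_len).decode("utf-8")
        n_dims, ftype = struct.unpack("<ii", fin.read(8))
        ne = struct.unpack(f"<{n_dims}q", fin.read(8 * n_dims))   # ggml order, i.e. reversed
        data = np.fromfile(fin, dtype=np.float32, count=int(np.prod(ne)))
        return name, data.reshape(ne[::-1])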
 
@@ -314,9 +314,40 @@ def flatten_config(
     return __flatten(config)
 
 
+def read_layer_config(model: torch.nn.Module) -> Dict[str, Any]:
+    layer_config = {}
+
+    def _append_node_config(node: Any, prefix: str) -> None:
+        for k, v in node.__dict__.items():
+            # Skip special members. In particular all children module and tensors
+            # will be hidden in special dicts `_parameters` and `_modules`
+            if k.startswith("_"):
+                continue
+            # All modules have a "training" flag
+            if k == "training":
+                continue
+            if v is None:
+                continue
+
+            try:
+                to_ctype(v)
+            except ValueError:
+                logging.warning(f"Skipping layer config {k}={v!r}")
+                continue
+            layer_config[prefix + k] = v
+
+    _append_node_config(model, "")
+    for name, node in find_children(model, torch.nn.Module):
+        _append_node_config(node, name + ".")
+    return layer_config
+
+
 def to_ctype(value: Any) -> Tuple[str, Any]:
     """Transform python type to ctype.
 
+    Note: we always use little-endian and 8-byte types.
+    This makes the format independent of the current platform.
+
     :params value:
         value to cast into ctype
 
@@ -324,20 +355,20 @@ def to_ctype(value: Any) -> Tuple[str, Any]:
         A tuple of ctype and cvalue.
     """
     if isinstance(value, int):
-        return ("l", value)
+        return ("<q", value)
     if isinstance(value, float):
-        return ("d", value)
+        return ("<d", value)
     if isinstance(value, bool):
-        return ("l", value)
+        return ("<q", value)
     if isinstance(value, Enum):
-        return ("l", value.value)
+        return ("<q", value.value)
     if isinstance(value, tuple) and len(value) == 1:
         return to_ctype(value[0])
     if isinstance(value, str) and len(value) < 8:
         value = bytes(value, "ascii")
         if len(value) < 8:
             value = value + (8 - len(value)) * b"\0"
-        return ("l", struct.unpack("l", value)[0])
+        return ("8s", value)
 
     raise ValueError(f"Unsupported type {type(value)}")
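
The key property after this change is that every value to_ctype returns packs to exactly 8 little-endian bytes, regardless of the host (the old native "l" format is 4 bytes on some platforms and 8 on others). A quick sanity check of that invariant (sample values are made up):

    import struct

    samples = [("<q", 12), ("<d", 0.1), ("<q", int(True)), ("8s", b"relu")]
    for fmt, val in samples:
        assert len(struct.pack(fmt, val)) == 8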
 

+ 0 - 1
ggml/src/CMakeLists.txt

@@ -251,7 +251,6 @@ add_library(${TARGET}
     ggml-alloc.c
     ../examples/unity/fairseq2.cpp
     ../examples/unity/model_loader.cpp
-    ../examples/unity/unity_model_loader.cpp
     ../examples/unity/unity.cpp
     ../examples/common.cpp
     ../examples/common-ggml.cpp

+ 13 - 9
ggml/test_unity_cpp.py

@@ -16,7 +16,7 @@ from typing import Any
 from pathlib import Path
 from typing import Iterator
 from ggml import NativeObj
-from ggml_convert import convert_model
+from ggml_convert import convert_model, read_layer_config
 from seamless_communication.models.inference.translator import Translator, Modality
 from fairseq2.data.audio import WaveformToFbankConverter
 import torchaudio
@@ -46,7 +46,7 @@ def _load_g_model_once() -> NativeObj:
     model_file = Path(__file__).parent / "seamlessM4T_medium.ggml"
     if not model_file.exists():
         convert_model("seamlessM4T_medium", model_file)
-    return ggml.load_unity_ggml_file(model_file)
+    return ggml.load_fairseq2_ggml_file(model_file)
 
 @pytest.fixture()
 def g_model(ctx: Ctx) -> c_void_p:
@@ -65,14 +65,18 @@ def load_pt_model() -> Any:
     return load_translator().model
 
 
-@pytest.mark.xfail(reason="TODO")
-def test_hparams_code_is_up_to_date() -> None:
-    model_file = Path(__file__).parent / "seamlessM4T_medium.ggml"
+def test_convert_linear(tmp_path: Path) -> None:
+    module = fairseq2.nn.Linear(16, 24, True)
+
+    layer_config = read_layer_config(module)
+    assert layer_config == {"input_dim": 16, "output_dim": 24, "skip_init": False}
+
+    module_file = Path("module.ggml")
+    convert_model(module, module_file)
+    g_module = ggml.load_fairseq2_ggml_file(module_file)
 
-    hparams_header_file = model_file.with_suffix(".hparams.h")
-    hparams_struct = hparams_header_file.read_text().strip()
-    actual_code = (UNITY_MODELS.parent / "unity_model_loader.h").read_text()
-    assert hparams_struct in actual_code
+    for k, v in layer_config.items():
+        assert ggml.fairseq2_model_layer_config_int(g_module.ptr, bytes(k, "ascii")) == v
 
 
 def test_causal_attention_mask(ctx: Ctx):