
force little-endian

Guillaume Wenzek, 1 year ago
Commit eb7810b81f

+ 0 - 1
ggml/examples/unity/CMakeLists.txt

@@ -7,5 +7,4 @@ target_sources(unity
     PRIVATE
         fairseq2.cpp
         model_loader.cpp
-        unity_model_loader.cpp
 )

+ 8 - 6
ggml/examples/unity/fairseq2.cpp

@@ -12,25 +12,27 @@
 extern "C" fairseq2_model* fairseq2_model_alloc() {
     // pre-allocate some memory to write hyperparameters and tensors pointers
     auto* model = new fairseq2_model;
-    model->hparams = new std::uint8_t[8 * 1024];
     model->tensors_ctx = nullptr;
     return model;
 }
 
 
-double fairseq2_model_layer_config_double(const fairseq2_model& model, std::string name) {
+inline double model_layer_config_d(const fairseq2_model& model, std::string name) {
     const std::int64_t* data = &model.layer_config.at(name);
     return *(double*)data;
 }
 
-std::int64_t fairseq2_model_layer_config_int(const fairseq2_model& model, std::string name) {
-    return model.layer_config.at(name);
+extern "C" double fairseq2_model_layer_config_double(const fairseq2_model& model, const char* name) {
+    return model_layer_config_d(model, std::string(name));
+}
+
+extern "C" std::int64_t fairseq2_model_layer_config_int(const fairseq2_model& model, const char* name) {
+    return model.layer_config.at(std::string(name));
 }
 
 
 extern "C" void fairseq2_model_free(fairseq2_model* model) {
     if (model->tensors_ctx) ggml_free(model->tensors_ctx);
-    delete (std::uint8_t*)model->hparams;
     delete model;
 }
 
@@ -77,7 +79,7 @@ extern "C" ggml_tensor* LayerNorm_forward(
     GGML_ASSERT(bias != nullptr);
 
     auto ctx = model.ctx;
-    double eps = fairseq2_model_layer_config_double(model, prefix + ".eps");
+    double eps = model_layer_config_d(model, prefix + ".eps");
 
     input = ggml_norm(ctx, input, /*eps*/eps);
     return ggml_add_inplace(
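
The layer_config map stores every value in an int64 slot, and model_layer_config_d recovers a double by reinterpreting those 8 bytes; this only works because the converter writes the raw little-endian bit pattern. A minimal Python sketch of that round trip (plain illustration, not part of the diff; the eps value is made up):

    import struct

    eps = 1e-5
    raw = struct.pack("<d", eps)               # the 8 bytes written into the .ggml file
    as_int64 = struct.unpack("<q", raw)[0]     # what load_hparams stores in layer_config

    # What model_layer_config_d does on the C++ side: reinterpret the int64 bits as a double.
    recovered = struct.unpack("<d", struct.pack("<q", as_int64))[0]
    assert recovered == eps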

+ 12 - 4
ggml/examples/unity/fairseq2.h

@@ -1,6 +1,6 @@
 #pragma once
 
-#include <map>
+#include <unordered_map>
 #include <string>
 #include <vector>
 #include "ggml.h"
@@ -10,10 +10,17 @@
 struct fairseq2_model {
     // Context containing all tensors memory
     ggml_context* tensors_ctx;
+
     // Named tensors, all tensors should belong to tensors_ctx
-    std::map<std::string, struct ggml_tensor *> tensors;
-    std::map<std::string, std::int64_t> layer_config;
-    void* hparams;
+    std::unordered_map<std::string, struct ggml_tensor *> tensors;
+
+    // Hashmap containing model hyper-parameters.
+    std::unordered_map<std::string, std::int64_t> hparams;
+
+    // Hashmap containing layers hyper-parameters.
+    // Normally these could be inferred from hparams, but storing them here avoids reimplementing that logic in GGML
+    std::unordered_map<std::string, std::int64_t> layer_config;
+
     // an inference context, not managed by this object
     // TODO: is this the best place to store this or should we also pass this to all forward methods ?
     ggml_context* ctx;
@@ -147,6 +154,7 @@ extern "C" ggml_tensor* StandardConformerEncoderAdaptor_forward(
     ggml_tensor* padding_mask
 );
 // Specifies the Layer Normalization order.
+// see fairseq2/nn/transformer/norm_order.py
 enum TransformerNormOrder {
     TRANSFORMER_NORM_ORDER_POST = 0,
     TRANSFORMER_NORM_ORDER_PRE = 1,
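
To illustrate the split between the two new maps (keys below are drawn from elsewhere in this commit, values are made up): hparams holds the flattened model config, while layer_config holds per-module attributes that forward functions look up as prefix + ".<attr>".

    # hparams: flattened model config (compare the removed unity_hparams struct)
    hparams = {"model_dim": 1024, "w2v2_encoder_config__num_encoder_layers": 12}

    # layer_config: per-module attributes, e.g. ".eps" read by LayerNorm_forward,
    # or the Linear attributes checked in test_convert_linear
    layer_config = {"input_dim": 16, "output_dim": 24, "skip_init": 0}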

+ 66 - 16
ggml/examples/unity/model_loader.cpp

@@ -35,12 +35,22 @@ void register_prefix(fairseq2_model &model, const std::string& name) {
 }
 
 
-int
+std::int64_t
 model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
 {
-    int num_tensor = 0;
+    std::int64_t num_tensor = 0;
+    std::int64_t ctx_size = 0;
     fin.read((char*) &num_tensor, sizeof(num_tensor));
-    size_t total_size = 0;
+    fin.read((char*) &ctx_size, sizeof(ctx_size));
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ static_cast<std::size_t>(ctx_size),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+    model.tensors_ctx = ggml_init(params);
+
+    size_t model_size = 0;
     for (int i = 0; i < num_tensor; ++i) {
         std::string name = get_name(fin);
         if (name.length() == 0)
@@ -49,7 +59,7 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
         if (tensor == nullptr) {
             // Abort in case of error, the input stream is corrupted at this point.
             printf("Error while reading tensor %s\n", name.c_str() );
-            return 1;
+            throw std::invalid_argument("Error while reading tensor from file.");
         }
         register_prefix(model, name);
         ggml_set_name(tensor, name.c_str());
@@ -57,27 +67,57 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
         if (DEBUG_MODEL_LOAD) {
             printf("%s [%5ld, %5ld], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), tensor->ne[0], tensor->ne[1], ggml_type_name(tensor->type), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
         }
-        total_size += ggml_nbytes(tensor);
+        model_size += ggml_nbytes(tensor);
     }
 
-    printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    return 0;
+    double mb = 1024.0 * 1024.0;
+    printf("%s: model size  = %8.2f MB, memory used = %8.2f MB, memory reserved = %8.2f \n",
+        __func__,
+        model_size / mb,
+        ggml_used_mem(model.tensors_ctx) / mb,
+        ctx_size / mb
+    );
+
+    return ctx_size;
+}
+
+void assert_endianness() {
+    union {
+        unsigned int i;
+        char c[4];
+    } un;
+    un.i = 0x12345678;
+
+    if (un.c[0] == 0x78 && un.c[3] == 0x12) {
+        printf("little-endian\n");
+    }
+    else if (un.c[0] == 0x12 && un.c[3] == 0x78) {
+        printf("big-endian\n");
+        GGML_ASSERT(false); // model_loader.cpp assumes the system is little-endian
+    }
+    else {
+        printf("unknown-endian\n");
+        GGML_ASSERT(false); // model_loader.cpp assumes the system is little-endian
+    }
 }
 
 
-int
-model_loader::load_layer_config(fairseq2_model &model, std::ifstream &fin)
+void model_loader::load_hparams(std::unordered_map<std::string, std::int64_t>& hparams, std::ifstream &fin)
 {
+    std::int64_t num_params = 0;
+    fin.read(reinterpret_cast<char*>(&num_params), sizeof num_params);
+    GGML_ASSERT(fin.gcount() == 8);
+
+    hparams.reserve(num_params);
+
     std::int64_t value;
-    while (!fin.eof()) {
+    for (int i = 0; i < num_params; ++i) {
         std::string name = get_name(fin);
         if (name.length() == 0)
             break;
         fin.read((char*) &value, sizeof(value));
-        model.layer_config[name] = value;
+        hparams[name] = value;
     }
-
-    return 0;
 }
 
 ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx)
@@ -107,11 +147,21 @@ model_loader::get_name(std::ifstream& fin)
 {
     std::uint32_t length = 0;
     fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+    if (length == 0)
+        return "";
+
     std::string name(length, 0);
-    if (length == 0) {
-        return name;
-    };
     fin.read(&name[0], length);
 
     return name;
 }
+
+extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
+    model_loader loader;
+    assert_endianness();
+    auto fin = open_ggml_file(fname);
+    loader.load_hparams(model.hparams, fin);
+    loader.load_hparams(model.layer_config, fin);
+    loader.load_model_weights(model, fin);
+    return 0;
+}
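
With this change, load_fairseq2_ggml_file expects the sections in a fixed order: the 4-byte magic, the model hparams, the layer_config, then the weights, whose header now carries both the tensor count and the pre-computed context size. A hedged Python sketch that parses just those headers (the file name is only an example; tensor payloads are not read):

    import struct

    def read_hparams_section(fin):
        # Mirrors model_loader::load_hparams: int64 count, then (name, 8-byte value) pairs.
        (count,) = struct.unpack("<q", fin.read(8))
        out = {}
        for _ in range(count):
            (name_len,) = struct.unpack("<i", fin.read(4))
            name = fin.read(name_len).decode("utf-8")
            out[name] = fin.read(8)  # raw 8 bytes: an int64 or the bit pattern of a double
        return out

    with open("seamlessM4T_medium.ggml", "rb") as fin:
        magic = fin.read(4)                         # written reversed by write_ggml_header
        hparams = read_hparams_section(fin)         # model hyper-parameters
        layer_config = read_hparams_section(fin)    # per-layer config
        num_tensors, ctx_size = struct.unpack("<qq", fin.read(16))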

+ 3 - 28
ggml/examples/unity/model_loader.h

@@ -19,15 +19,9 @@
 
 class model_loader {
 public:
-    virtual ~model_loader() {};
+    std::int64_t load_model_weights(fairseq2_model &model, std::ifstream &fin);
 
-    virtual void load_hparams(fairseq2_model& model, std::ifstream &fin) = 0;
-
-    virtual std::size_t compute_context_size(void *raw_hparams) = 0;
-
-    int load_model_weights(fairseq2_model &model, std::ifstream &fin);
-
-    int load_layer_config(fairseq2_model &model, std::ifstream &fin);
+    void load_hparams(std::unordered_map<std::string, std::int64_t>& hparams, std::ifstream &fin);
 
 private:
     ggml_tensor * next_tensor(std::ifstream &fin, fairseq2_model &model);
@@ -39,23 +33,4 @@ ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx);
 
 std::ifstream open_ggml_file(const char* fname);
 
-template<typename T>
-int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
-    T loader;
-    auto fin = open_ggml_file(fname);
-    loader.load_hparams(model, fin);
-
-    std::size_t ctx_size = loader.compute_context_size(model.hparams);
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ false,
-    };
-    model.tensors_ctx = ggml_init(params);
-
-    int err = loader.load_model_weights(model, fin);
-    if (err) return err;
-
-    return loader.load_layer_config(model, fin);
-}
-
+extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname);

+ 1 - 1
ggml/examples/unity/unity.cpp

@@ -10,7 +10,7 @@
 #include <cstdio>
 #include <cstring>
 #include <fstream>
-#include <map>
+#include <unordered_map>
 #include <string>
 #include <vector>
 #include <iostream>

+ 0 - 36
ggml/examples/unity/unity_model_loader.cpp

@@ -1,36 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-// All rights reserved.
-//
-// This source code is licensed under the license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include "unity_model_loader.h"
-
-void unity_model_loader::load_hparams(fairseq2_model& model, std::ifstream &fin)
-{
-    unity_hparams* hparams = (unity_hparams*)model.hparams;
-    read_unity_hparams(*hparams, fin);
-    if (hparams->__end_of_hparams__ != 6877961321223123048) {
-        throw std::invalid_argument("");
-    }
-}
-
-std::size_t
-unity_model_loader::compute_context_size(void* raw_hparams)
-{
-    auto* hparams = (unity_hparams*)raw_hparams;
-    return hparams->model_byte_size;
-};
-
-extern "C" int load_unity_ggml_file(fairseq2_model& model, const char* fname) {
-    return load_fairseq2_ggml_file<unity_model_loader>(model, fname);
-}
-
-
-

+ 0 - 134
ggml/examples/unity/unity_model_loader.h

@@ -1,134 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-// All rights reserved.
-//
-// This source code is licensed under the license found in the
-// LICENSE file in the root directory of this source tree.
-
-#pragma once
-
-#include <vector>
-#include "model_loader.h"
-
-
-struct unity_hparams {
-    std::int64_t model_dim;
-    std::int64_t w2v2_encoder_config__model_dim;
-    std::int64_t w2v2_encoder_config__max_seq_len;
-    std::int64_t w2v2_encoder_config__feature_dim;
-    std::int64_t w2v2_encoder_config__use_fbank;
-    double w2v2_encoder_config__first_pass_dropout_p;
-    std::int64_t w2v2_encoder_config__layer_norm_features;
-    // Error: Unsupported type <class 'list'> w2v2_encoder_config__feature_extractor_layer_descs;
-    std::int64_t w2v2_encoder_config__feature_extractor_bias;
-    std::int64_t w2v2_encoder_config__feature_extractor_layer_norm_convs;
-    std::int64_t w2v2_encoder_config__feature_grad_scale;
-    std::int64_t w2v2_encoder_config__num_fbank_channels;
-    std::int64_t w2v2_encoder_config__fbank_stride;
-    std::int64_t w2v2_encoder_config__sample_fbank_every_k;
-    // Error: Unsupported type <class 'str'> w2v2_encoder_config__pos_encoder_type;
-    std::int64_t w2v2_encoder_config__pos_encoder_depth;
-    std::int64_t w2v2_encoder_config__pos_conv_kernel_size;
-    std::int64_t w2v2_encoder_config__num_pos_conv_groups;
-    std::int64_t w2v2_encoder_config__use_conformer;
-    std::int64_t w2v2_encoder_config__num_encoder_layers;
-    std::int64_t w2v2_encoder_config__num_encoder_attn_heads;
-    std::int64_t w2v2_encoder_config__ffn_inner_dim;
-    double w2v2_encoder_config__dropout_p;
-    double w2v2_encoder_config__attn_dropout_p;
-    double w2v2_encoder_config__layer_drop_p;
-    std::int64_t w2v2_encoder_config__norm_order;
-    std::int64_t w2v2_encoder_config__depthwise_conv_kernel_size;
-    std::int64_t mt_model_config__model_dim;
-    std::int64_t mt_model_config__max_seq_len;
-    std::int64_t mt_model_config__vocabulary_size;
-    std::int64_t mt_model_config__pad_idx;
-    std::int64_t mt_model_config__num_encoder_layers;
-    std::int64_t mt_model_config__num_decoder_layers;
-    std::int64_t mt_model_config__num_encoder_attn_heads;
-    std::int64_t mt_model_config__num_decoder_attn_heads;
-    std::int64_t mt_model_config__ffn_inner_dim;
-    double mt_model_config__dropout_p;
-    std::int64_t t2u_config__model_dim;
-    std::int64_t t2u_config__unit_max_seq_len;
-    std::int64_t t2u_config__unit_vocabulary_size;
-    std::int64_t t2u_config__unit_pad_idx;
-    std::int64_t t2u_config__num_encoder_layers;
-    std::int64_t t2u_config__num_decoder_layers;
-    std::int64_t t2u_config__num_encoder_attn_heads;
-    std::int64_t t2u_config__num_decoder_attn_heads;
-    std::int64_t t2u_config__ffn_inner_dim;
-    double t2u_config__dropout_p;
-    std::int64_t use_text_encoder;
-    std::int64_t use_conformer_adaptor;
-    std::int64_t num_adaptor_layers;
-    std::int64_t adaptor_kernel_size;
-    std::int64_t adaptor_stride;
-    std::int64_t adaptor_layer_norm;
-    double adaptor_dropout_p;
-    std::int64_t model_byte_size;
-    std::int64_t __end_of_hparams__;
-};
-
-void read_unity_hparams(unity_hparams& out, std::ifstream &fin) {
-    fin.read((char*) &out.model_dim, sizeof(out.model_dim));
-    fin.read((char*) &out.w2v2_encoder_config__model_dim, sizeof(out.w2v2_encoder_config__model_dim));
-    fin.read((char*) &out.w2v2_encoder_config__max_seq_len, sizeof(out.w2v2_encoder_config__max_seq_len));
-    fin.read((char*) &out.w2v2_encoder_config__feature_dim, sizeof(out.w2v2_encoder_config__feature_dim));
-    fin.read((char*) &out.w2v2_encoder_config__use_fbank, sizeof(out.w2v2_encoder_config__use_fbank));
-    fin.read((char*) &out.w2v2_encoder_config__first_pass_dropout_p, sizeof(out.w2v2_encoder_config__first_pass_dropout_p));
-    fin.read((char*) &out.w2v2_encoder_config__layer_norm_features, sizeof(out.w2v2_encoder_config__layer_norm_features));
-    fin.read((char*) &out.w2v2_encoder_config__feature_extractor_bias, sizeof(out.w2v2_encoder_config__feature_extractor_bias));
-    fin.read((char*) &out.w2v2_encoder_config__feature_extractor_layer_norm_convs, sizeof(out.w2v2_encoder_config__feature_extractor_layer_norm_convs));
-    fin.read((char*) &out.w2v2_encoder_config__feature_grad_scale, sizeof(out.w2v2_encoder_config__feature_grad_scale));
-    fin.read((char*) &out.w2v2_encoder_config__num_fbank_channels, sizeof(out.w2v2_encoder_config__num_fbank_channels));
-    fin.read((char*) &out.w2v2_encoder_config__fbank_stride, sizeof(out.w2v2_encoder_config__fbank_stride));
-    fin.read((char*) &out.w2v2_encoder_config__sample_fbank_every_k, sizeof(out.w2v2_encoder_config__sample_fbank_every_k));
-    fin.read((char*) &out.w2v2_encoder_config__pos_encoder_depth, sizeof(out.w2v2_encoder_config__pos_encoder_depth));
-    fin.read((char*) &out.w2v2_encoder_config__pos_conv_kernel_size, sizeof(out.w2v2_encoder_config__pos_conv_kernel_size));
-    fin.read((char*) &out.w2v2_encoder_config__num_pos_conv_groups, sizeof(out.w2v2_encoder_config__num_pos_conv_groups));
-    fin.read((char*) &out.w2v2_encoder_config__use_conformer, sizeof(out.w2v2_encoder_config__use_conformer));
-    fin.read((char*) &out.w2v2_encoder_config__num_encoder_layers, sizeof(out.w2v2_encoder_config__num_encoder_layers));
-    fin.read((char*) &out.w2v2_encoder_config__num_encoder_attn_heads, sizeof(out.w2v2_encoder_config__num_encoder_attn_heads));
-    fin.read((char*) &out.w2v2_encoder_config__ffn_inner_dim, sizeof(out.w2v2_encoder_config__ffn_inner_dim));
-    fin.read((char*) &out.w2v2_encoder_config__dropout_p, sizeof(out.w2v2_encoder_config__dropout_p));
-    fin.read((char*) &out.w2v2_encoder_config__attn_dropout_p, sizeof(out.w2v2_encoder_config__attn_dropout_p));
-    fin.read((char*) &out.w2v2_encoder_config__layer_drop_p, sizeof(out.w2v2_encoder_config__layer_drop_p));
-    fin.read((char*) &out.w2v2_encoder_config__norm_order, sizeof(out.w2v2_encoder_config__norm_order));
-    fin.read((char*) &out.w2v2_encoder_config__depthwise_conv_kernel_size, sizeof(out.w2v2_encoder_config__depthwise_conv_kernel_size));
-    fin.read((char*) &out.mt_model_config__model_dim, sizeof(out.mt_model_config__model_dim));
-    fin.read((char*) &out.mt_model_config__max_seq_len, sizeof(out.mt_model_config__max_seq_len));
-    fin.read((char*) &out.mt_model_config__vocabulary_size, sizeof(out.mt_model_config__vocabulary_size));
-    fin.read((char*) &out.mt_model_config__pad_idx, sizeof(out.mt_model_config__pad_idx));
-    fin.read((char*) &out.mt_model_config__num_encoder_layers, sizeof(out.mt_model_config__num_encoder_layers));
-    fin.read((char*) &out.mt_model_config__num_decoder_layers, sizeof(out.mt_model_config__num_decoder_layers));
-    fin.read((char*) &out.mt_model_config__num_encoder_attn_heads, sizeof(out.mt_model_config__num_encoder_attn_heads));
-    fin.read((char*) &out.mt_model_config__num_decoder_attn_heads, sizeof(out.mt_model_config__num_decoder_attn_heads));
-    fin.read((char*) &out.mt_model_config__ffn_inner_dim, sizeof(out.mt_model_config__ffn_inner_dim));
-    fin.read((char*) &out.mt_model_config__dropout_p, sizeof(out.mt_model_config__dropout_p));
-    fin.read((char*) &out.t2u_config__model_dim, sizeof(out.t2u_config__model_dim));
-    fin.read((char*) &out.t2u_config__unit_max_seq_len, sizeof(out.t2u_config__unit_max_seq_len));
-    fin.read((char*) &out.t2u_config__unit_vocabulary_size, sizeof(out.t2u_config__unit_vocabulary_size));
-    fin.read((char*) &out.t2u_config__unit_pad_idx, sizeof(out.t2u_config__unit_pad_idx));
-    fin.read((char*) &out.t2u_config__num_encoder_layers, sizeof(out.t2u_config__num_encoder_layers));
-    fin.read((char*) &out.t2u_config__num_decoder_layers, sizeof(out.t2u_config__num_decoder_layers));
-    fin.read((char*) &out.t2u_config__num_encoder_attn_heads, sizeof(out.t2u_config__num_encoder_attn_heads));
-    fin.read((char*) &out.t2u_config__num_decoder_attn_heads, sizeof(out.t2u_config__num_decoder_attn_heads));
-    fin.read((char*) &out.t2u_config__ffn_inner_dim, sizeof(out.t2u_config__ffn_inner_dim));
-    fin.read((char*) &out.t2u_config__dropout_p, sizeof(out.t2u_config__dropout_p));
-    fin.read((char*) &out.use_text_encoder, sizeof(out.use_text_encoder));
-    fin.read((char*) &out.use_conformer_adaptor, sizeof(out.use_conformer_adaptor));
-    fin.read((char*) &out.num_adaptor_layers, sizeof(out.num_adaptor_layers));
-    fin.read((char*) &out.adaptor_kernel_size, sizeof(out.adaptor_kernel_size));
-    fin.read((char*) &out.adaptor_stride, sizeof(out.adaptor_stride));
-    fin.read((char*) &out.adaptor_layer_norm, sizeof(out.adaptor_layer_norm));
-    fin.read((char*) &out.adaptor_dropout_p, sizeof(out.adaptor_dropout_p));
-    fin.read((char*) &out.model_byte_size, sizeof(out.model_byte_size));
-    fin.read((char*) &out.__end_of_hparams__, sizeof(out.__end_of_hparams__));
-};
-
-class unity_model_loader: public model_loader {
-    public:
-    void load_hparams(fairseq2_model& model, std::ifstream &fin);
-
-    std::size_t compute_context_size(void* raw_hparams);
-};

+ 10 - 4
ggml/ggml.py

@@ -307,14 +307,14 @@ def CppStr(content: str) -> NativeObj:
     return NativeObj("std_string", cpp_str)
 
 
-lib.load_unity_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-lib.load_unity_ggml_file.restype = ctypes.c_int
+lib.load_fairseq2_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
+lib.load_fairseq2_ggml_file.restype = ctypes.c_int
 
 
-def load_unity_ggml_file(model_file: Path) -> NativeObj:
+def load_fairseq2_ggml_file(model_file: Path) -> NativeObj:
     model = Fairseq2Model()
     bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
-    err = lib.load_unity_ggml_file(model.ptr, bytes_file)
+    err = lib.load_fairseq2_ggml_file(model.ptr, bytes_file)
     if err:
         raise Exception("Failed to load model")
     return model
@@ -446,6 +446,12 @@ def generate_sequence(
 ) -> Ptr[Hypothesis]:
     ...
 
+
 @c_fn(lib)
 def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
     return Ptr()
+
+
+@c_fn(lib)
+def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
+    pass
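
With the loader renamed and the accessor exposed, a converted file can be loaded and queried from Python without the old unity-specific entry point. A usage sketch (the file name matches the test fixture below; the layer_config key is hypothetical):

    from pathlib import Path
    import ggml

    model = ggml.load_fairseq2_ggml_file(Path("seamlessM4T_medium.ggml"))
    # layer_config values are stored as int64; keys are passed as ASCII bytes,
    # as in test_convert_linear below.
    dim = ggml.fairseq2_model_layer_config_int(model.ptr, b"text_decoder.layers.0.ffn.inner_dim")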

+ 102 - 71
ggml/ggml_convert.py

@@ -68,28 +68,37 @@ def pos_enc(max_seq_len=4096, encoding_dim=1024):
 
     return weight
 
-def convert_model(model_name: str, out: Optional[Path] = None) -> None:
-    if out is None:
-        out = Path(model_name).with_suffix(".ggml")
-
-    # The type of model depends on the name
-    if "unity" in model_name or "seamlessM4T" in model_name:
-        model_config = load_unity_config(model_name)
-        hparams = flatten_config(dataclasses.asdict(model_config), separator="__")
-        print(hparams)
-        model = load_unity_model(model_name)
+def convert_model(
+    model_name: Union[str, torch.nn.Module],
+    out: Optional[Path] = None,
+    hparams: Optional[Dict[str, Any]] = None,
+) -> None:
+    if isinstance(model_name, str):
+        # Load the corresponding fairseq2 model
+        if out is None:
+            out = Path(model_name).with_suffix(".ggml")
+
+        # The type of model depends on the name
+        if "unity" in model_name or "seamlessM4T" in model_name:
+            if hparams is None:
+                model_config = load_unity_config(model_name)
+                hparams = flatten_config(dataclasses.asdict(model_config), separator="__")
+                print(hparams)
+            model = load_unity_model(model_name)
+        else:
+            raise ValueError(f"Unsupported model type: {model_name}")
     else:
-        raise ValueError(f"Unsupported model type: {model_name}")
+        # Use the model passed explicitly
+        assert out is not None, "output path is required when explicitly passing a module"
+        hparams = hparams or {}
+        model = model_name
 
     state_dict = model.state_dict()
     fixup_model(model, state_dict)
+    layer_config = read_layer_config(model)
 
     with out.open("wb") as o:
-        write_ggml_file(o, hparams, state_dict)
-        write_layer_config(o, model)
-
-    with out.with_suffix(".hparams.h").open("w") as h:
-        h.write(generate_hparams_struct(hparams, "unity_hparams"))
+        write_ggml_file(o, hparams, layer_config, state_dict)
 
 
 def _nested_getattr(model: Any, name: str) -> Any:
@@ -120,7 +129,10 @@ def find_children(model: torch.nn.Module, t: type) -> List[Tuple[str, torch.nn.M
 def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) -> None:
     # Bake the embedding scaling into the weights
     frontends = find_children(model, TransformerEmbeddingFrontend)
-    print("Upgrading the following TransformerEmbeddingFrontend:", [x[0] for x in frontends])
+    print(
+        "Upgrading the following TransformerEmbeddingFrontend:",
+        [x[0] for x in frontends],
+    )
     for name, frontend in frontends:
         embed_weights = state_dict[name + ".embed.weight"]
         state_dict[name + ".embed.weight"] = embed_weights * frontend.scale
@@ -128,7 +140,10 @@ def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) ->
     # Sinusoidal embeddings are typically not saved since they are easily recomputed,
     # but this allows to avoid porting the sinusoidal logic to GGML
     pos_encoders = find_children(model, SinusoidalPositionEncoder)
-    print("Upgrading the following SinusoidalPositionEncoder:", [x[0] for x in pos_encoders])
+    print(
+        "Upgrading the following SinusoidalPositionEncoder:",
+        [x[0] for x in pos_encoders],
+    )
     for name, pos_encoder in pos_encoders:
         assert isinstance(pos_encoder.weight, torch.Tensor)
         assert name not in state_dict
@@ -137,29 +152,19 @@ def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) ->
     state_dict["speech_encoder.pos_enc"] = pos_enc()
 
 def write_ggml_file(
-    out: BufferedWriter, hparams: Dict[str, Any], state_dict: Dict[str, torch.Tensor]
+    out: BufferedWriter,
+    hparams: Dict[str, Any],
+    layer_config: Dict[str, Any],
+    state_dict: Dict[str, torch.Tensor],
 ) -> None:
     write_ggml_header(out)
-
-    # Apppend the byte size to the hparams.
-    if "model_byte_size" not in hparams:
-        # Size of each tensor
-        byte_size = sum(x.numel() * x.element_size() for x in state_dict.values())
-        # + tensor overhead
-        byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
-        hparams["model_byte_size"] = byte_size
-        logging.warning(
-            f"Saving a ggml file with {len(state_dict)} tensors, for an estimated amount of {byte_size / (1024**3)} GGML Gb"
-        )
-    # 6877961321223123048
-    hparams["__end_of_hparams__"] = struct.unpack("l", b"hparams_")[0]
-
     write_hparams(out, hparams)
+    write_hparams(out, layer_config)
     write_state_dict(out, state_dict)
 
 
 def write_ggml_header(out: BufferedWriter) -> None:
-    """Write GGML header (in reverse cause why not)"""
+    """Write GGML header (in reverse cause big-endian)"""
     out.write(b"ggml"[::-1])
 
 
@@ -170,17 +175,23 @@ def write_hparams(out: BufferedWriter, hparams: Dict[str, Any]) -> None:
         flattened dict containing model's hyper parameters.
 
     """
-    # TODO: should we preprend the size of the hparams struct ?
-    # this would help catch out of sync writer/loader code
+    simple_vals = {}
     for key, value in hparams.items():
         try:
-            # TODO: this is not cross platform, what's the standard way of writing hparams in GGML ?
-            ctype, cvalue = to_ctype(value)
-            out.write(struct.pack(ctype, cvalue))
+            simple_vals[key] = to_ctype(value)
         except ValueError:
             logging.warning(f"Skipping config for key {key}={value!r}")
             continue
 
+    out.write(struct.pack("<q", len(simple_vals)))
+    for key, (ctype, cvalue) in simple_vals.items():
+        write_string(out, key)
+        b = struct.pack(ctype, cvalue)
+        assert len(b) == 8
+        out.write(b)
+
+    logging.info(f"Saved {len(simple_vals)} params.")
+
 
 def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -> None:
     """Write pytorch state dict.
@@ -188,7 +199,15 @@ def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -
     :paras state_dict:
         state dict returned by pytorch model
     """
-    out.write(struct.pack("i", len(state_dict)))
+    out.write(struct.pack("<q", len(state_dict)))
+    # Size of each tensor
+    byte_size = sum(x.numel() * x.element_size() for x in state_dict.values())
+    # + tensor overhead
+    byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
+    out.write(struct.pack("<q", byte_size))
+    logging.warning(
+        f"Saving a ggml file with {len(state_dict)} tensors, for an estimated amount of {byte_size / (1024**3):.3f} GGML Gb"
+    )
     for key, value in state_dict.items():
         write_string(out, key)
         if key.endswith(".bias") and value.ndim == 1 and "adaptor" not in key:
@@ -201,27 +220,6 @@ def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -
         write_tensor(out, value.contiguous())
 
 
-def write_layer_config(out: BufferedWriter, model: torch.nn.Module) -> None:
-    for name, node in find_children(model, torch.nn.Module):
-        for k, v in node.__dict__.items():
-            # Skip special members. In particular all children module and tensors
-            # will be hidden in special dicts `_parameters` and `_modules`
-            if k.startswith("_"):
-                continue
-            # All modules have a "training" flag
-            if k == "training":
-                continue
-            if v is None:
-                continue
-            try:
-                ctype, cvalue = to_ctype(v)
-                write_string(out, f"{name}.{k}")
-                out.write(struct.pack(ctype, cvalue))
-            except ValueError as e:
-                logging.warning(f"Skipping config for {name}.{k}={v!r}")
-                continue
-
-
 def write_string(out: BufferedWriter, value: str) -> None:
     """Write string in utf-8 format.
 
@@ -229,7 +227,9 @@ def write_string(out: BufferedWriter, value: str) -> None:
         string value to dump.
     """
     str_ = value.encode("utf-8")
-    out.write(struct.pack("i", len(str_)))
+    packed_len = struct.pack("<i", len(str_))
+    assert len(packed_len) == 4
+    out.write(packed_len)
     out.write(str_)
 
 
@@ -243,7 +243,7 @@ def write_tensor(out: BufferedWriter, value: torch.Tensor) -> None:
         Tensor to dump.
     """
     if value.dtype is torch.int64:
-        # GGML doesn't ahve int64, downcast it
+        # GGML doesn't have int64, downcast it
         value = value.to(dtype=torch.int32)
 
     if value.ndim == 0:
@@ -256,11 +256,11 @@ def write_tensor(out: BufferedWriter, value: torch.Tensor) -> None:
     assert n_dims >= 1, "ggml doesn't support 0 dim tensors"
 
     ftype = torch_to_ggml_type(value.dtype)
-    out.write(struct.pack("i", n_dims))
-    out.write(struct.pack("i", ftype))
+    out.write(struct.pack("<i", n_dims))
+    out.write(struct.pack("<i", ftype))
     for i in range(n_dims):
         # ggml uses long for shape
-        out.write(struct.pack("l", data.shape[n_dims - 1 - i]))
+        out.write(struct.pack("<q", data.shape[n_dims - 1 - i]))
 
     data.tofile(out)
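
After the count and byte-size header above, each tensor record is laid out as name, dims, dtype and raw data. A sketch of reading one record back (assumes an f32 tensor; the ftype-to-dtype mapping and the bias reshaping are deliberately ignored here):

    import struct
    import numpy as np

    def read_tensor_record(fin):
        (name_len,) = struct.unpack("<i", fin.read(4))
        name = fin.read(name_len).decode("utf-8")
        n_dims, ftype = struct.unpack("<ii", fin.read(8))
        ne = struct.unpack(f"<{n_dims}q", fin.read(8 * n_dims))   # ggml order, i.e. reversed
        data = np.fromfile(fin, dtype=np.float32, count=int(np.prod(ne)))
        return name, data.reshape(ne[::-1])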
 
@@ -314,9 +314,40 @@ def flatten_config(
     return __flatten(config)
 
 
+def read_layer_config(model: torch.nn.Module) -> Dict[str, Any]:
+    layer_config = {}
+
+    def _append_node_config(node: Any, prefix: str) -> None:
+        for k, v in node.__dict__.items():
+            # Skip special members. In particular all children module and tensors
+            # will be hidden in special dicts `_parameters` and `_modules`
+            if k.startswith("_"):
+                continue
+            # All modules have a "training" flag
+            if k == "training":
+                continue
+            if v is None:
+                continue
+
+            try:
+                to_ctype(v)
+            except ValueError:
+                logging.warning(f"Skipping layer config {k}={v!r}")
+                continue
+            layer_config[prefix + k] = v
+
+    _append_node_config(model, "")
+    for name, node in find_children(model, torch.nn.Module):
+        _append_node_config(node, name + ".")
+    return layer_config
+
+
 def to_ctype(value: Any) -> Tuple[str, Any]:
     """Transform python type to ctype.
 
+    Note: we always use little-endian and 8-byte types.
+    This makes the format independent of the current platform.
+
     :params value:
         value to cast into ctype
 
@@ -324,20 +355,20 @@ def to_ctype(value: Any) -> Tuple[str, Any]:
         A tuple of ctype and cvalue.
     """
     if isinstance(value, int):
-        return ("l", value)
+        return ("<q", value)
     if isinstance(value, float):
-        return ("d", value)
+        return ("<d", value)
     if isinstance(value, bool):
-        return ("l", value)
+        return ("<q", value)
     if isinstance(value, Enum):
-        return ("l", value.value)
+        return ("<q", value.value)
     if isinstance(value, tuple) and len(value) == 1:
         return to_ctype(value[0])
     if isinstance(value, str) and len(value) < 8:
         value = bytes(value, "ascii")
         if len(value) < 8:
             value = value + (8 - len(value)) * b"\0"
-        return ("l", struct.unpack("l", value)[0])
+        return ("8s", value)
 
     raise ValueError(f"Unsupported type {type(value)}")
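
The key property after this change is that every value to_ctype returns packs to exactly 8 little-endian bytes, regardless of the host (the old native "l" format is 4 bytes on some platforms and 8 on others). A quick sanity check of that invariant (sample values are made up):

    import struct

    samples = [("<q", 12), ("<d", 0.1), ("<q", int(True)), ("8s", b"relu")]
    for fmt, val in samples:
        assert len(struct.pack(fmt, val)) == 8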
 

+ 0 - 1
ggml/src/CMakeLists.txt

@@ -251,7 +251,6 @@ add_library(${TARGET}
     ggml-alloc.c
     ../examples/unity/fairseq2.cpp
     ../examples/unity/model_loader.cpp
-    ../examples/unity/unity_model_loader.cpp
     ../examples/unity/unity.cpp
     ../examples/common.cpp
     ../examples/common-ggml.cpp

+ 13 - 9
ggml/test_unity_cpp.py

@@ -16,7 +16,7 @@ from typing import Any
 from pathlib import Path
 from typing import Iterator
 from ggml import NativeObj
-from ggml_convert import convert_model
+from ggml_convert import convert_model, read_layer_config
 from seamless_communication.models.inference.translator import Translator, Modality
 from fairseq2.data.audio import WaveformToFbankConverter
 import torchaudio
@@ -46,7 +46,7 @@ def _load_g_model_once() -> NativeObj:
     model_file = Path(__file__).parent / "seamlessM4T_medium.ggml"
     if not model_file.exists():
         convert_model("seamlessM4T_medium", model_file)
-    return ggml.load_unity_ggml_file(model_file)
+    return ggml.load_fairseq2_ggml_file(model_file)
 
 @pytest.fixture()
 def g_model(ctx: Ctx) -> c_void_p:
@@ -65,14 +65,18 @@ def load_pt_model() -> Any:
     return load_translator().model
 
 
-@pytest.mark.xfail(reason="TODO")
-def test_hparams_code_is_up_to_date() -> None:
-    model_file = Path(__file__).parent / "seamlessM4T_medium.ggml"
+def test_convert_linear(tmp_path: Path) -> None:
+    module = fairseq2.nn.Linear(16, 24, True)
+
+    layer_config = read_layer_config(module)
+    assert layer_config == {"input_dim": 16, "output_dim": 24, "skip_init": False}
+
+    module_file = Path("module.ggml")
+    convert_model(module, module_file)
+    g_module = ggml.load_fairseq2_ggml_file(module_file)
 
-    hparams_header_file = model_file.with_suffix(".hparams.h")
-    hparams_struct = hparams_header_file.read_text().strip()
-    actual_code = (UNITY_MODELS.parent / "unity_model_loader.h").read_text()
-    assert hparams_struct in actual_code
+    for k, v in layer_config.items():
+        assert ggml.fairseq2_model_layer_config_int(g_module.ptr, bytes(k, "ascii")) == v
 
 
 def test_causal_attention_mask(ctx: Ctx):