@@ -129,132 +129,9 @@ void read_unity_hparams(unity_hparams* out, std::ifstream &fin) {

};

-
-
-
-
-// Embedding
-std::size_t compute_embed_size(int32_t vocab_size, int32_t dim)
-{
-    return vocab_size * dim * ggml_type_size(GGML_TYPE_F32);
-};
-
-// Attention Layer
-
-struct attention_layer {
-    struct ggml_tensor* layer_norm_w; // model_dim
-    struct ggml_tensor* layer_norm_b; // model_dim
-
-    struct ggml_tensor* q_proj_w; // model_dim x model_dim
-    struct ggml_tensor* q_proj_b; // model_dim
-    struct ggml_tensor* k_proj_w; // model_dim x model_dim
-    struct ggml_tensor* k_proj_b; // model_dim
-    struct ggml_tensor* v_proj_w; // model_dim x model_dim
-    struct ggml_tensor* v_proj_b; // model_dim
-
-    struct ggml_tensor* output_proj_w; // model_dim x model_dim
-    struct ggml_tensor* output_proj_b; // model_dim
-};
-
-std::size_t compute_attention_layer_size(int32_t dim)
-{
-    return LayerNorm_size(dim)
-        + 4 * Linear_size(dim, dim); // q, k, v, and out
-};
-
-void init_attention_layer(
-    attention_layer *layer,
-    fairseq2_model &model_ctx,
-    const std::string &prefix)
-{
-    auto hparams = (unity_hparams&)model_ctx.hparams;
-    const auto dim = hparams.nllb_config__model_dim;
-    auto ctx = model_ctx.ctx;
-    auto &tensor_map = model_ctx.tensors;
-
-    layer->layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + "_layer_norm.weight"] = layer->layer_norm_w;
-    layer->layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + "_layer_norm.bias"] = layer->layer_norm_b;
-
-    layer->q_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, dim);
-    tensor_map[prefix + ".q_proj.weight"] = layer->q_proj_w;
-    layer->q_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + ".q_proj.bias"] = layer->q_proj_b;
-
-    layer->k_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, dim);
-    tensor_map[prefix + ".k_proj.weight"] = layer->k_proj_w;
-    layer->k_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + ".k_proj.bias"] = layer->k_proj_b;
-
-    layer->v_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, dim);
-    tensor_map[prefix + ".v_proj.weight"] = layer->v_proj_w;
-    layer->v_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + ".v_proj.bias"] = layer->v_proj_b;
-
-    layer->output_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, dim);
-    tensor_map[prefix + ".output_proj.weight"] = layer->output_proj_w;
-    layer->output_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
-    tensor_map[prefix + ".output_proj.bias"] = layer->output_proj_b;
-}
-
-
-// Attention Head
-
-struct attention_head {
-    struct attention_layer* self_attn; // model_dim
-    struct attention_layer* encoder_decoder_attn; // model_dim
-    struct StandardFeedForwardNetwork* ffn;
-};
-
-std::size_t compute_attention_head_size(int32_t dim, int32_t inner_dim)
-{
-    return 2 * compute_attention_layer_size(dim) + StandardFeedForwardNetwork_size(dim, inner_dim);
-};
-
-void init_attention_head(
-    attention_head *head,
-    fairseq2_model &model_ctx,
-    const std::string &prefix)
-{
-    auto hparams = (unity_hparams&)model_ctx.hparams;
-    init_attention_layer(head->self_attn, model_ctx, prefix + ".self_attn");
-    init_attention_layer(head->encoder_decoder_attn, model_ctx, prefix + ".encoder_decoder_attn");
-    StandardFeedForwardNetwork_init((StandardFeedForwardNetwork&)(head->ffn), model_ctx, prefix + ".ffn", hparams.nllb_config__model_dim, hparams.nllb_config__ffn_inner_dim);
-}
-
-// TODO: attention_head_compute_graph
-
-// Text Decoder
-
-struct text_decoder {
-    struct ggml_tensor* frontend_embed_w; // vocab_size x model_dim
-    std::vector<attention_head*> multi_head;
-    struct ggml_tensor* layer_norm_w;
-    struct ggml_tensor* layer_norm_b;
-};
-
-std::size_t compute_context_size(void* raw_hparams)
-{
-    auto hparams = (unity_hparams&)raw_hparams;
-    const auto vocab_size = hparams.nllb_config__vocabulary_size;
-    const auto dim = hparams.nllb_config__model_dim;
-    const auto inner_dim = hparams.nllb_config__ffn_inner_dim;
-    const auto n_layers = hparams.nllb_config__num_decoder_layers;
-
-    const auto overhead = (6 + 12 * n_layers) * 512; // TODO Find out what this is.
-
-    return compute_embed_size(vocab_size, dim)
-        + n_layers * compute_attention_head_size(dim, inner_dim)
-        + LayerNorm_size(dim)
-        + overhead;
-};
-
class unity_model_loader: public model_loader {
public:
    void load_hparams(fairseq2_model& model, std::ifstream &fin);

    std::size_t compute_context_size(void* raw_hparams);
-
-    void tensors_alloc(fairseq2_model &model);
};