2 年之前 · e73d4eb3f1
--- a/ggml/examples/CMakeLists.txt
+++ b/ggml/examples/CMakeLists.txt
@@ -28,3 +28,4 @@ add_subdirectory(replit)
 
															 add_subdirectory(mpt)
														
 
															 add_subdirectory(starcoder)
														
 
															 add_subdirectory(sam)
														
 
															+add_subdirectory(unity)
														
--- a/ggml/examples/unity/CMakeLists.txt
+++ b/ggml/examples/unity/CMakeLists.txt
@@ -0,0 +1,13 @@
 
															+#
														
 
															+# unity
														
 
															+
														
 
															+set(TEST_TARGET unity)
														
 
															+add_executable(${TEST_TARGET} unity.cpp)
														
 
															+target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
														
 
															+
														
 
															+#
														
 
															+# TODO: unity-quantize
														
 
															+
														
 
															+# set(TEST_TARGET unity-quantize)
														
 
															+# add_executable(${TEST_TARGET} quantize.cpp)
														
 
															+# target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
														
--- a/ggml/examples/unity/convert-pt-to-ggml.py
+++ b/ggml/examples/unity/convert-pt-to-ggml.py
@@ -0,0 +1,84 @@
 
															+# Convert UnitY model from PyTorch to ggml format
														
 
															+#
														
 
															+# Usage: python3.8 /private/home/dnn/ggml/ggml/examples/unity/convert-pt-to-ggml.py /large_experiments/seamless/ust/dnn/unity_large_audio_enc.pt /private/home/dnn/ggml/ggml/examples/unity/models/unity-large
														
 
															+# 
														
 
															+import io
														
 
															+import sys
														
 
															+import struct
														
 
															+import torch
														
 
															+import numpy as np
														
 
															+from pathlib import Path
														
 
															+from convert_pt_states import generate_mapping
														
 
															+
														
 
															+
														
 
															+if len(sys.argv) < 3:
														
 
															+    print("Usage: convert-pt-to-ggml.py model.pt dir-output [use-f32]\n")
														
 
															+    sys.exit(1)
														
 
															+
														
 
															+fname_inp   = Path(sys.argv[1])
														
 
															+dir_out     = Path(sys.argv[2])
														
 
															+
														
 
															+# try to load PyTorch binary data
														
 
															+try:
														
 
															+    model_bytes = open(fname_inp, "rb").read()
														
 
															+    with io.BytesIO(model_bytes) as fp:
														
 
															+        checkpoint = torch.load(fp, map_location="cpu")
														
 
															+except Exception:
														
 
															+    print("Error: failed to load PyTorch model file:" , fname_inp)
														
 
															+    sys.exit(1)
														
 
															+
														
 
															+hparams = {"n_text_vocab": 256064, "n_audio_enc_dim": 1024, "n_audio_enc_ffn_dim": 4096, "n_audio_enc_feat_dim": 160, "n_audio_enc_layer": 24, "n_audio_enc_head": 16}
														
 
															+print("hparams:", hparams)
														
 
															+
														
 
															+list_vars = checkpoint["model"]
														
 
															+state_map = generate_mapping(list_vars)
														
 
															+
														
 
															+# output in the same directory as the model
														
 
															+fname_out = dir_out / "ggml-model.bin"
														
 
															+
														
 
															+# use 16-bit or 32-bit floats
														
 
															+use_f16 = True
														
 
															+if len(sys.argv) > 4:
														
 
															+    use_f16 = False
														
 
															+    fname_out = dir_out / "ggml-model-f32.bin"
														
 
															+
														
 
															+fout = fname_out.open("wb")
														
 
															+
														
 
															+fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
														
 
															+for key in hparams.keys():
														
 
															+    fout.write(struct.pack("i", hparams[key]))
														
 
															+fout.write(struct.pack("i", use_f16))
														
 
															+
														
 
															+exclude_list = []
														
 
															+exclude_list += [f"encoder.w2v_encoder.w2v_model.encoder.layers.{i}.conv_module.batch_norm.num_batches_tracked" for i in range(24)]
														
 
															+
														
 
															+for name in list_vars.keys():
														
 
															+    if list_vars[name] is None or name in exclude_list or "adaptor" in name:
														
 
															+        continue
														
 
															+    data = list_vars[name].squeeze().numpy()
														
 
															+    print("Processing variable: " , name ,  " with shape: ", data.shape)
														
 
															+
														
 
															+    n_dims = len(data.shape)
														
 
															+
														
 
															+    # TODO: Convert to fp16 when necessary!
														
 
															+    ftype = 0
														
 
															+    if name not in state_map:
														
 
															+        continue
														
 
															+    # header
														
 
															+    # if 'pos_bias' in name:
														
 
															+    #     import pdb; pdb.set_trace()
														
 
															+    #     print(data.shape)
														
 
															+    str_ = state_map[name].encode('utf-8')
														
 
															+    fout.write(struct.pack("iii", n_dims, len(str_), ftype))
														
 
															+    for i in range(n_dims):
														
 
															+        fout.write(struct.pack("i", data.shape[i]))
														
 
															+    fout.write(str_)
														
 
															+
														
 
															+    # data
														
 
															+    data.tofile(fout)
														
 
															+
														
 
															+fout.close()
														
 
															+
														
 
															+print("Done. Output file: " , fname_out)
														
 
															+print("")
														
 
															+
														
--- a/ggml/examples/unity/convert_pt_states.py
+++ b/ggml/examples/unity/convert_pt_states.py
@@ -0,0 +1,51 @@
 
															+import torch
														
 
															+def map_state_key(pytorch_key, layer_idx):
														
 
															+    # Replace the layer index first
														
 
															+    pytorch_key = pytorch_key.replace(f".layers.{layer_idx}.", "/")
														
 
															+    
														
 
															+    # Replace common patterns in the state key
														
 
															+    translation_dict = {
														
 
															+        ".weight": "/w",
														
 
															+        ".bias": "/b",
														
 
															+        ".running_mean": "/m", # /running_mean doesn't work
														
 
															+        ".running_var": "/v",
														
 
															+        ".num_batches_tracked": "/n",
														
 
															+        "self_attn.": "self_attn_",
														
 
															+        "conv_module.": "conv_",
														
 
															+        "ffn1.": "ffn1_",
														
 
															+        "ffn2.": "ffn2_",
														
 
															+    }
														
 
															+    
														
 
															+    
														
 
															+    # Special mapping for pos_bias_u and pos_bias_v
														
 
															+    if "self_attn.pos_bias_u" in pytorch_key:
														
 
															+        pytorch_key = pytorch_key.replace("self_attn.pos_bias_u", "self_attn_pos_bias/u")
														
 
															+    elif "self_attn.pos_bias_v" in pytorch_key:
														
 
															+        pytorch_key = pytorch_key.replace("self_attn.pos_bias_v", "self_attn_pos_bias/v")
														
 
															+    for pytorch_pattern, model_pattern in translation_dict.items():
														
 
															+        pytorch_key = pytorch_key.replace(pytorch_pattern, model_pattern)
														
 
															+    
														
 
															+    # Replace the leading pattern and add layer index
														
 
															+    return pytorch_key.replace("encoder.w2v_encoder.w2v_model.encoder/", f"model/h{layer_idx}/")
														
 
															+
														
 
															+
														
 
															+def generate_mapping(state_dict):
														
 
															+    mapping = {}
														
 
															+    for state in state_dict.keys():
														
 
															+        for layer_idx in range(24):
														
 
															+            if f".layers.{layer_idx}." in state:
														
 
															+                new_key = map_state_key(state, layer_idx)
														
 
															+                mapping[state] = new_key
														
 
															+    return mapping
														
 
															+
														
 
															+
														
 
															+# Testing
														
 
															+ckpt = torch.load('/large_experiments/seamless/ust/dnn/unity_large_audio_enc.pt')
														
 
															+state_dict = {}
														
 
															+for key in ckpt['model']:
														
 
															+    if ckpt['model'][key] is not None:
														
 
															+        state_dict[key] = ckpt['model'][key]
														
 
															+
														
 
															+mapped_keys = generate_mapping(state_dict)
														
 
															+for old_key, new_key in mapped_keys.items():
														
 
															+    print(old_key, "=>", new_key)
														
--- a/ggml/examples/unity/unity.cpp
+++ b/ggml/examples/unity/unity.cpp
@@ -0,0 +1,485 @@
 
															+#include "ggml/ggml.h"
														
 
															+#include "ggml/ggml-alloc.h"
														
 
															+
														
 
															+#include "common.h"
														
 
															+#include "common-ggml.h"
														
 
															+
														
 
															+#include <cassert>
														
 
															+#include <cmath>
														
 
															+#include <cstdio>
														
 
															+#include <cstring>
														
 
															+#include <fstream>
														
 
															+#include <map>
														
 
															+#include <string>
														
 
															+#include <vector>
														
 
															+#include <iostream>
														
 
															+// default hparams
														
 
															+struct unity_hparams {
														
 
															+    int32_t n_text_vocab = 256206; 
														
 
															+    int32_t n_unit_vocab = 10084; 
														
 
															+    int32_t n_audio_enc_dim = 1024;
														
 
															+    int32_t n_audio_enc_ffn_dim = 4096;
														
 
															+    int32_t n_audio_enc_feat_dim = 160;
														
 
															+    int32_t n_audio_enc_layer = 24;
														
 
															+    int32_t n_audio_enc_head = 16;
														
 
															+    int32_t ftype   = 1;
														
 
															+    float   eps     = 1e-5f;
														
 
															+};
														
 
															+// layer def
														
 
															+struct audio_enc_layer {
														
 
															+    struct ggml_tensor * self_attn_layer_norm_w;
														
 
															+    struct ggml_tensor * self_attn_layer_norm_b;
														
 
															+
														
 
															+    struct ggml_tensor * self_attn_linear_k_w;
														
 
															+    struct ggml_tensor * self_attn_linear_k_b;
														
 
															+    struct ggml_tensor * self_attn_linear_q_w;
														
 
															+    struct ggml_tensor * self_attn_linear_q_b;
														
 
															+    struct ggml_tensor * self_attn_linear_v_w;
														
 
															+    struct ggml_tensor * self_attn_linear_v_b;
														
 
															+    struct ggml_tensor * self_attn_linear_out_w;
														
 
															+    struct ggml_tensor * self_attn_linear_out_b;
														
 
															+    struct ggml_tensor * self_attn_linear_pos_w;
														
 
															+
														
 
															+    struct ggml_tensor * self_attn_pos_bias_u;
														
 
															+    struct ggml_tensor * self_attn_pos_bias_v;
														
 
															+
														
 
															+    struct ggml_tensor * conv_layer_norm_w;
														
 
															+    struct ggml_tensor * conv_layer_norm_b;
														
 
															+
														
 
															+    struct ggml_tensor * conv_pointwise_conv1_w;
														
 
															+    struct ggml_tensor * conv_depthwise_conv_w;
														
 
															+    struct ggml_tensor * conv_batch_norm_w;
														
 
															+    struct ggml_tensor * conv_batch_norm_b;
														
 
															+    struct ggml_tensor * conv_batch_norm_running_mean;
														
 
															+    struct ggml_tensor * conv_batch_norm_running_var;
														
 
															+    struct ggml_tensor * conv_batch_norm_num_batches_tracked;
														
 
															+    struct ggml_tensor * conv_pointwise_conv2_w;
														
 
															+
														
 
															+    struct ggml_tensor * ffn1_layer_norm_w;
														
 
															+    struct ggml_tensor * ffn1_layer_norm_b;
														
 
															+    struct ggml_tensor * ffn1_w1;
														
 
															+    struct ggml_tensor * ffn1_b1;
														
 
															+    struct ggml_tensor * ffn1_w2;
														
 
															+    struct ggml_tensor * ffn1_b2;
														
 
															+
														
 
															+    struct ggml_tensor * ffn2_layer_norm_w;
														
 
															+    struct ggml_tensor * ffn2_layer_norm_b;
														
 
															+    struct ggml_tensor * ffn2_w1;
														
 
															+    struct ggml_tensor * ffn2_b1;
														
 
															+    struct ggml_tensor * ffn2_w2;
														
 
															+    struct ggml_tensor * ffn2_b2;
														
 
															+
														
 
															+    struct ggml_tensor * final_layer_norm_w;
														
 
															+    struct ggml_tensor * final_layer_norm_b;
														
 
															+};
														
 
															+
														
 
															+// struct ggml_tensor * conv_ln;
														
 
															+// struct ggml_tensor * conv_pool_1d;
														
 
															+
														
 
															+// model def
														
 
															+struct unity_model {
														
 
															+    unity_hparams hparams;
														
 
															+    // audio encoder
														
 
															+    struct ggml_tensor * post_extract_proj;
														
 
															+    struct ggml_tensor * audio_enc_pos_conv;
														
 
															+    std::vector<audio_enc_layer> audio_enc_layers;
														
 
															+
														
 
															+    // text encoder
														
 
															+    // std::vector<text_enc_layer> text_enc_layers;
														
 
															+
														
 
															+    // adaptor
														
 
															+    // std::vector<adapter_layer> adapter_layers;
														
 
															+
														
 
															+    // text decoder
														
 
															+    // std::vector<text_dec_layer> text_dec_layers;
														
 
															+
														
 
															+    // unit decoder
														
 
															+    // std::vector<unit_dec_layer> unit_dec_layers;
														
 
															+
														
 
															+    //
														
 
															+    struct ggml_context * ctx;
														
 
															+    std::map<std::string, struct ggml_tensor *> tensors;
														
 
															+};
														
 
															+
														
 
															+// model load
														
 
															+bool unity_model_load(const std::string & fname, unity_model & model, gpt_vocab & vocab) {
														
 
															+    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
														
 
															+
														
 
															+    auto fin = std::ifstream(fname, std::ios::binary);
														
 
															+    if (!fin) {
														
 
															+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
														
 
															+        return false;
														
 
															+    }
														
 
															+
														
 
															+    // verify magic
														
 
															+    {
														
 
															+        uint32_t magic;
														
 
															+        fin.read((char *) &magic, sizeof(magic));
														
 
															+        if (magic != GGML_FILE_MAGIC) {
														
 
															+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
														
 
															+            return false;
														
 
															+        }
														
 
															+    }
														
 
															+
														
 
															+    // load hparams
														
 
															+    {
														
 
															+        auto & hparams = model.hparams;
														
 
															+
														
 
															+        fin.read((char *) &hparams.n_text_vocab, sizeof(hparams.n_text_vocab));
														
 
															+        fin.read((char *) &hparams.n_audio_enc_dim,  sizeof(hparams.n_audio_enc_dim));
														
 
															+        fin.read((char *) &hparams.n_audio_enc_ffn_dim,  sizeof(hparams.n_audio_enc_ffn_dim));
														
 
															+        fin.read((char *) &hparams.n_audio_enc_feat_dim,  sizeof(hparams.n_audio_enc_feat_dim));
														
 
															+        fin.read((char *) &hparams.n_audio_enc_layer, sizeof(hparams.n_audio_enc_layer));
														
 
															+        fin.read((char *) &hparams.n_audio_enc_head, sizeof(hparams.n_audio_enc_head));
														
 
															+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
														
 
															+
														
 
															+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
														
 
															+
														
 
															+        printf("%s: n_text_vocab = %d\n", __func__, hparams.n_text_vocab);
														
 
															+        printf("%s: n_audio_enc_dim   = %d\n", __func__, hparams.n_audio_enc_dim);
														
 
															+        printf("%s: n_audio_enc_ffn_dim  = %d\n", __func__, hparams.n_audio_enc_ffn_dim);
														
 
															+        printf("%s: n_audio_enc_feat_dim = %d\n", __func__, hparams.n_audio_enc_feat_dim);
														
 
															+        printf("%s: n_audio_enc_layer = %d\n", __func__, hparams.n_audio_enc_layer);
														
 
															+        printf("%s: n_audio_enc_head = %d\n", __func__, hparams.n_audio_enc_head);
														
 
															+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
														
 
															+        printf("%s: qntvr   = %d\n", __func__, qntvr);
														
 
															+
														
 
															+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
														
 
															+    }
														
 
															+
														
 
															+    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
														
 
															+    // in order to save memory and also to speed up the computation
														
 
															+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
														
 
															+    if (wtype == GGML_TYPE_COUNT) {
														
 
															+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
														
 
															+                __func__, fname.c_str(), model.hparams.ftype);
														
 
															+        return false;
														
 
															+    }
														
 
															+
														
 
															+    auto & ctx = model.ctx;
														
 
															+
														
 
															+    size_t ctx_size = 0;
														
 
															+
														
 
															+    {
														
 
															+        const auto & hparams = model.hparams;
														
 
															+
														
 
															+        const int n_audio_enc_dim  = hparams.n_audio_enc_dim;
														
 
															+        const int n_audio_enc_ffn_dim  = hparams.n_audio_enc_ffn_dim;
														
 
															+        const int n_audio_enc_layer = hparams.n_audio_enc_layer;
														
 
															+        // const int n_text_vocab = hparams.n_text_vocab;
														
 
															+        const int kernel_size = 31;
														
 
															+
														
 
															+        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // self_attn_layer_norm_w
														
 
															+        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // self_attn_layer_norm_b
														
 
															+
														
 
															+        ctx_size += n_audio_enc_layer*(5*n_audio_enc_dim*n_audio_enc_dim*ggml_type_sizef(wtype));         // self_attn_w
														
 
															+        ctx_size += n_audio_enc_layer*(4*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // self_attn_b
														
 
															+
														
 
															+        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_layer_norm_w
														
 
															+        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_layer_norm_b
														
 
															+
														
 
															+        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*n_audio_enc_dim*2*ggml_type_sizef(wtype));           // conv_pointwise_conv1_w
														
 
															+        ctx_size += n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_batch_norm_w
														
 
															+        ctx_size += n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_batch_norm_b
														
 
															+        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*n_audio_enc_dim*kernel_size*ggml_type_sizef(wtype));         // conv_depthwise_conv_w
														
 
															+        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*n_audio_enc_dim*ggml_type_sizef(wtype));           // conv_pointwise_conv2_w
														
 
															+
														
 
															+        ctx_size += 2 * n_audio_enc_layer * (n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // ffn{1,2}_layer_norm_w
														
 
															+        ctx_size += 2 * n_audio_enc_layer * (n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // ffn{1,2}_layer_norm_b
														
 
															+        ctx_size += 2 * n_audio_enc_layer * (2 * n_audio_enc_dim * n_audio_enc_ffn_dim * ggml_type_sizef(wtype));  // ffn{1,2}_w{1,2}
														
 
															+        ctx_size += 2 * n_audio_enc_layer * (2 * n_audio_enc_dim * ggml_type_sizef(GGML_TYPE_F32));  // ffn{1,2}_b{1,2}
														
 
															+
														
 
															+        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm_w
														
 
															+        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm_b
														
 
															+
														
 
															+        // Adaptor
														
 
															+        // ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // conv_ln
														
 
															+        // ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // conv_pool_1d
														
 
															+
														
 
															+        // object overhead might differ depending on the structure and other miscellaneous factors
														
 
															+        ctx_size += (6 + 12*n_audio_enc_layer)*512; // updated object overhead
														
 
															+
														
 
															+        printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
														
 
															+        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
														
 
															+    }
														
 
															+
														
 
															+    // create the ggml context
														
 
															+    {
														
 
															+        struct ggml_init_params params = {
														
 
															+            /*.mem_size   =*/ ctx_size,
														
 
															+            /*.mem_buffer =*/ NULL,
														
 
															+            /*.no_alloc   =*/ false,
														
 
															+        };
														
 
															+
														
 
															+        model.ctx = ggml_init(params);
														
 
															+        if (!model.ctx) {
														
 
															+            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
														
 
															+            return false;
														
 
															+        }
														
 
															+    }
														
 
															+
														
 
															+    // prepare memory for the weights
														
 
															+    {
														
 
															+        const auto & hparams = model.hparams;
														
 
															+
														
 
															+        const int n_audio_enc_dim  = hparams.n_audio_enc_dim;
														
 
															+        const int n_audio_enc_ffn_dim  = hparams.n_audio_enc_ffn_dim;
														
 
															+        // const int n_audio_enc_feat_dim = hparams.n_audio_enc_feat_dim;
														
 
															+        const int n_audio_enc_layer = hparams.n_audio_enc_layer;
														
 
															+        const int n_audio_enc_head = hparams.n_audio_enc_head;
														
 
															+        // const int n_text_vocab = hparams.n_text_vocab;
														
 
															+
														
 
															+        model.audio_enc_layers.resize(n_audio_enc_layer);
														
 
															+        // model.post_extract_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_feat_dim);
														
 
															+        // model.post_extract_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+        // model.tensors["model/post_extract_proj/w"] = model.post_extract_proj_w
														
 
															+        // model.tensors["model/post_extract_proj/b"] = model.post_extract_proj_b
														
 
															+        // model.audio_enc_pos_conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim, 1);
														
 
															+        // model.tensors["model/audio_enc_pos_conv/w"] = model.audio_enc_pos_conv_w;
														
 
															+        // model.audio_enc_pos_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+        // model.tensors["model/audio_enc_pos_conv/b"] = model.audio_enc_pos_conv_b;
														
 
															+
														
 
															+        for (int i = 0; i < n_audio_enc_layer; ++i) {
														
 
															+            auto & layer = model.audio_enc_layers[i];
														
 
															+
														
 
															+            layer.self_attn_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.self_attn_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+
														
 
															+            layer.self_attn_linear_k_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
														
 
															+            layer.self_attn_linear_k_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.self_attn_linear_q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
														
 
															+            layer.self_attn_linear_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.self_attn_linear_v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
														
 
															+            layer.self_attn_linear_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.self_attn_linear_out_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
														
 
															+            layer.self_attn_linear_out_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.self_attn_linear_pos_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
														
 
															+
														
 
															+            layer.self_attn_pos_bias_u = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_head, n_audio_enc_dim / n_audio_enc_head);
														
 
															+            layer.self_attn_pos_bias_v = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_head, n_audio_enc_dim / n_audio_enc_head);
														
 
															+
														
 
															+            layer.conv_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.conv_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+
														
 
															+            layer.conv_pointwise_conv1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2*n_audio_enc_dim, n_audio_enc_dim);
														
 
															+            layer.conv_depthwise_conv_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, 31);
														
 
															+
														
 
															+            layer.conv_batch_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.conv_batch_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.conv_batch_norm_running_mean = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.conv_batch_norm_running_var = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.conv_batch_norm_num_batches_tracked = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 
														
 
															+
														
 
															+            layer.conv_pointwise_conv2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
														
 
															+
														
 
															+            layer.ffn1_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.ffn1_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+
														
 
															+            layer.ffn1_w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim, n_audio_enc_dim);
														
 
															+            layer.ffn1_b1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim);
														
 
															+
														
 
															+            layer.ffn1_w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_ffn_dim);
														
 
															+            layer.ffn1_b2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+
														
 
															+            layer.ffn2_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.ffn2_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+
														
 
															+            layer.ffn2_w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim, n_audio_enc_dim);
														
 
															+            layer.ffn2_b1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim);
														
 
															+
														
 
															+            layer.ffn2_w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_ffn_dim);
														
 
															+            layer.ffn2_b2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+
														
 
															+            layer.final_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+            layer.final_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
														
 
															+
														
 
															+            // map by name
														
 
															+
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_layer_norm/w"] = layer.self_attn_layer_norm_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_layer_norm/b"] = layer.self_attn_layer_norm_b;
														
 
															+
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_k/w"] = layer.self_attn_linear_k_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_k/b"] = layer.self_attn_linear_k_b;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_q/w"] = layer.self_attn_linear_q_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_q/b"] = layer.self_attn_linear_q_b;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_v/w"] = layer.self_attn_linear_v_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_v/b"] = layer.self_attn_linear_v_b;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_out/w"] = layer.self_attn_linear_out_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_out/b"] = layer.self_attn_linear_out_b;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_pos/w"] = layer.self_attn_linear_pos_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_pos_bias/u"] = layer.self_attn_pos_bias_u;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/self_attn_pos_bias/v"] = layer.self_attn_pos_bias_v;
														
 
															+
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_layer_norm/w"]        = layer.conv_layer_norm_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_layer_norm/b"]        = layer.conv_layer_norm_b;
														
 
															+
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_pointwise_conv1/w"] = layer.conv_pointwise_conv1_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_depthwise_conv/w"] = layer.conv_depthwise_conv_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/w"] = layer.conv_batch_norm_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/b"] = layer.conv_batch_norm_b;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/m"] = layer.conv_batch_norm_running_mean;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/v"] = layer.conv_batch_norm_running_var;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/n"] = layer.conv_batch_norm_num_batches_tracked;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/conv_pointwise_conv2/w"] = layer.conv_pointwise_conv2_w;
														
 
															+
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn1_layer_norm/w"] = layer.ffn1_layer_norm_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn1_layer_norm/b"] = layer.ffn1_layer_norm_b;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn1_w_1/w"] = layer.ffn1_w1;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn1_w_1/b"] = layer.ffn1_b1;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn1_w_2/w"] = layer.ffn1_w2;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn1_w_2/b"] = layer.ffn1_b2;
														
 
															+
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn2_layer_norm/w"] = layer.ffn2_layer_norm_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn2_layer_norm/b"] = layer.ffn2_layer_norm_b;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn2_w_1/w"] = layer.ffn2_w1;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn2_w_1/b"] = layer.ffn2_b1;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn2_w_2/w"] = layer.ffn2_w2;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/ffn2_w_2/b"] = layer.ffn2_b2;
														
 
															+
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/final_layer_norm/w"] = layer.final_layer_norm_w;
														
 
															+            model.tensors["model/h" + std::to_string(i) + "/final_layer_norm/b"] = layer.final_layer_norm_b;
														
 
															+        }
														
 
															+    }
														
 
															+
														
 
															+   
														
 
															+    // load weights
														
 
															+    {
														
 
															+        size_t total_size = 0;
														
 
															+        while (true) {
														
 
															+            int32_t n_dims;
														
 
															+            int32_t length;
														
 
															+            int32_t ttype;
														
 
															+
														
 
															+            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
														
 
															+            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
														
 
															+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
														
 
															+
														
 
															+            if (fin.eof()) {
														
 
															+                break;
														
 
															+            }
														
 
															+
														
 
															+            int32_t nelements = 1;
														
 
															+            int32_t ne[3] = { 1, 1, 1};
														
 
															+            for (int i = 0; i < n_dims; ++i) {
														
 
															+                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
														
 
															+                nelements *= ne[i];
														
 
															+            }
														
 
															+
														
 
															+            std::string name(length, 0);
														
 
															+            fin.read(&name[0], length);
														
 
															+
														
 
															+            std::cout << "loading " << name << " " << n_dims << std::endl;
														
 
															+
														
 
															+            if (model.tensors.find(name) == model.tensors.end()) {
														
 
															+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
														
 
															+                return false;
														
 
															+            }
														
 
															+
														
 
															+            auto tensor = model.tensors[name];
														
 
															+            if (ggml_nelements(tensor) != nelements) {
														
 
															+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
														
 
															+                return false;
														
 
															+            }
														
 
															+
														
 
															+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
														
 
															+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
														
 
															+                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
														
 
															+                return false;
														
 
															+            }
														
 
															+
														
 
															+            // for debugging
														
 
															+            if (0) {
														
 
															+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
														
 
															+            }
														
 
															+
														
 
															+            const size_t bpe = ggml_type_size(ggml_type(ttype));
														
 
															+
														
 
															+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
														
 
															+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
														
 
															+                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
														
 
															+                return false;
														
 
															+            }
														
 
															+
														
 
															+            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
														
 
															+
														
 
															+            total_size += ggml_nbytes(tensor);
														
 
															+        }
														
 
															+
														
 
															+        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
														
 
															+    }
														
 
															+
														
 
															+    fin.close();
														
 
															+
														
 
															+    return true;
														
 
															+}
														
 
															+
														
 
															+int main(int argc, char ** argv) {
														
 
															+    // ggml_time_init();
														
 
															+
														
 
															+    // const int64_t t_main_start_us = ggml_time_us();
														
 
															+
														
 
															+    gpt_params params;
														
 
															+    params.model = "models/gpt-2-117M/ggml-model.bin";
														
 
															+
														
 
															+    if (gpt_params_parse(argc, argv, params) == false) {
														
 
															+        return 1;
														
 
															+    }
														
 
															+
														
 
															+    if (params.seed < 0) {
														
 
															+        params.seed = time(NULL);
														
 
															+    }
														
 
															+
														
 
															+    printf("%s: seed = %d\n", __func__, params.seed);
														
 
															+
														
 
															+    std::mt19937 rng(params.seed);
														
 
															+    if (params.prompt.empty()) {
														
 
															+        params.prompt = gpt_random_prompt(rng);
														
 
															+    }
														
 
															+
														
 
															+    // int64_t t_load_us = 0;
														
 
															+
														
 
															+    gpt_vocab vocab;
														
 
															+    unity_model model;
														
 
															+
														
 
															+    // load the model
														
 
															+    {
														
 
															+        // const int64_t t_start_us = ggml_time_us();
														
 
															+
														
 
															+        if (!unity_model_load(params.model, model, vocab)) {
														
 
															+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
														
 
															+            return 1;
														
 
															+        }
														
 
															+
														
 
															+        // t_load_us = ggml_time_us() - t_start_us;
														
 
															+
														
 
															+        // test_gpt_tokenizer(vocab, params.token_test);
														
 
															+    }
														
 
															+
														
 
															+    // keep this buffer alive while evaluating the model
														
 
															+    // std::vector<uint8_t> compute_buffer;
														
 
															+
														
 
															+    // struct ggml_allocr * allocr = NULL;
														
 
															+    // // allocate the compute buffer
														
 
															+    // {
														
 
															+    //     allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
														
 
															+
														
 
															+    //     // create the worst case graph for memory usage estimation
														
 
															+    //     int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
														
 
															+    //     int n_past = model.hparams.n_ctx - n_tokens;
														
 
															+    //     struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
														
 
															+
														
 
															+    //     // compute the required memory
														
 
															+    //     size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
														
 
															+
														
 
															+    //     // recreate the allocator with the required memory
														
 
															+    //     ggml_allocr_free(allocr);
														
 
															+    //     compute_buffer.resize(mem_size);
														
 
															+    //     allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN);
														
 
															+
														
 
															+    //     fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
														
 
															+    // }
														
 
															+
														
 
															+    ggml_free(model.ctx);
														
 
															+
														
 
															+    return 0;
														
 
															+}