Guillaume Wenzek 1 жил өмнө
parent
commit
5024f43b15

+ 69 - 54
ggml/examples/unity/unity.cpp

@@ -25,10 +25,15 @@ struct unity_hparams {
     int32_t ftype   = 1;
     float   eps     = 1e-5f;
 };
+
 // layer def
+struct layer_norm_layer {  // Pairs the two tensors of one layer norm so they can be passed around as a unit.
+    struct ggml_tensor * w;  // scale: multiplied with the normalized input in unity_layer_norm; the loader creates it as a 1-D F32 tensor
+    struct ggml_tensor * b;  // shift: added after the multiplication in unity_layer_norm; the loader creates it as a 1-D F32 tensor
+};
+
 struct audio_enc_layer {
-    struct ggml_tensor * self_attn_layer_norm_w;
-    struct ggml_tensor * self_attn_layer_norm_b;
+    struct layer_norm_layer self_attn_layer_norm;
 
     struct ggml_tensor * self_attn_linear_k_w;
     struct ggml_tensor * self_attn_linear_k_b;
@@ -43,8 +48,7 @@ struct audio_enc_layer {
     struct ggml_tensor * self_attn_pos_bias_u;
     struct ggml_tensor * self_attn_pos_bias_v;
 
-    struct ggml_tensor * conv_layer_norm_w;
-    struct ggml_tensor * conv_layer_norm_b;
+    struct layer_norm_layer conv_layer_norm;
 
     struct ggml_tensor * conv_pointwise_conv1_w;
     struct ggml_tensor * conv_depthwise_conv_w;
@@ -55,22 +59,19 @@ struct audio_enc_layer {
     struct ggml_tensor * conv_batch_norm_num_batches_tracked;
     struct ggml_tensor * conv_pointwise_conv2_w;
 
-    struct ggml_tensor * ffn1_layer_norm_w;
-    struct ggml_tensor * ffn1_layer_norm_b;
+    struct layer_norm_layer ffn1_layer_norm;
     struct ggml_tensor * ffn1_w1;
     struct ggml_tensor * ffn1_b1;
     struct ggml_tensor * ffn1_w2;
     struct ggml_tensor * ffn1_b2;
 
-    struct ggml_tensor * ffn2_layer_norm_w;
-    struct ggml_tensor * ffn2_layer_norm_b;
+    struct layer_norm_layer ffn2_layer_norm;
     struct ggml_tensor * ffn2_w1;
     struct ggml_tensor * ffn2_b1;
     struct ggml_tensor * ffn2_w2;
     struct ggml_tensor * ffn2_b2;
 
-    struct ggml_tensor * final_layer_norm_w;
-    struct ggml_tensor * final_layer_norm_b;
+    struct layer_norm_layer final_layer_norm;
 };
 
 // struct ggml_tensor * conv_ln;
@@ -85,11 +86,9 @@ struct unity_model {
     struct ggml_tensor * audio_enc_pos_conv_wg;
     struct ggml_tensor * audio_enc_pos_conv_wv;
     struct ggml_tensor * audio_enc_pos_conv_b;
-    struct ggml_tensor * audio_enc_layer_norm_w;
-    struct ggml_tensor * audio_enc_layer_norm_b;
+    struct layer_norm_layer audio_enc_layer_norm;
     struct ggml_tensor * audio_enc_pos_enc_w;
-    struct ggml_tensor * layer_norm_w;
-    struct ggml_tensor * layer_norm_b;
+    struct layer_norm_layer layer_norm;
     struct ggml_tensor * memory_k;
     struct ggml_tensor * memory_v;
     std::vector<audio_enc_layer> audio_enc_layers;
@@ -197,14 +196,14 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
         // const int n_text_vocab = hparams.n_text_vocab;
         const int kernel_size = 31;
 
-        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // self_attn_layer_norm_w
-        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // self_attn_layer_norm_b
+        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // self_attn_layer_norm.w
+        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // self_attn_layer_norm.b
 
         ctx_size += n_audio_enc_layer*(5*n_audio_enc_dim*n_audio_enc_dim*ggml_type_sizef(wtype));         // self_attn_w
         ctx_size += n_audio_enc_layer*(4*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // self_attn_b
 
-        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_layer_norm_w
-        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_layer_norm_b
+        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_layer_norm.w
+        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_layer_norm.b
 
         ctx_size += n_audio_enc_layer*(n_audio_enc_dim*n_audio_enc_dim*2*ggml_type_sizef(wtype));           // conv_pointwise_conv1_w
         ctx_size += n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_batch_norm_w
@@ -212,13 +211,13 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
         ctx_size += n_audio_enc_layer*(n_audio_enc_dim*n_audio_enc_dim*kernel_size*ggml_type_sizef(wtype));         // conv_depthwise_conv_w
         ctx_size += n_audio_enc_layer*(n_audio_enc_dim*n_audio_enc_dim*ggml_type_sizef(wtype));           // conv_pointwise_conv2_w
 
-        ctx_size += 2 * n_audio_enc_layer * (n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // ffn{1,2}_layer_norm_w
-        ctx_size += 2 * n_audio_enc_layer * (n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // ffn{1,2}_layer_norm_b
+        ctx_size += 2 * n_audio_enc_layer * (n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // ffn{1,2}_layer_norm.w
+        ctx_size += 2 * n_audio_enc_layer * (n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // ffn{1,2}_layer_norm.b
         ctx_size += 2 * n_audio_enc_layer * (2 * n_audio_enc_dim * n_audio_enc_ffn_dim * ggml_type_sizef(wtype));  // ffn{1,2}_w{1,2}
         ctx_size += 2 * n_audio_enc_layer * (2 * n_audio_enc_dim * ggml_type_sizef(GGML_TYPE_F32));  // ffn{1,2}_b{1,2}
 
-        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm_w
-        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm_b
+        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm.w
+        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm.b
 
         ctx_size += n_ctx*n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // memory_k
         ctx_size += n_ctx*n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // memory_v
@@ -278,23 +277,23 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
         model.tensors["model/enc/pos_conv/w_v"] = model.audio_enc_pos_conv_wv;
         model.tensors["model/enc/pos_conv/b"] = model.audio_enc_pos_conv_b;
 
-        model.audio_enc_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
-        model.audio_enc_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
-        model.tensors["model/enc/layer_norm/w"] = model.audio_enc_layer_norm_w;
-        model.tensors["model/enc/layer_norm/b"] = model.audio_enc_layer_norm_b;
+        model.audio_enc_layer_norm.w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+        model.audio_enc_layer_norm.b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+        model.tensors["model/enc/layer_norm/w"] = model.audio_enc_layer_norm.w;
+        model.tensors["model/enc/layer_norm/b"] = model.audio_enc_layer_norm.b;
 
-        model.layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_feat_dim);
-        model.layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_feat_dim);
-        model.tensors["model/layer_norm/w"] = model.layer_norm_w;
-        model.tensors["model/layer_norm/b"] = model.layer_norm_b;
+        model.layer_norm.w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_feat_dim);
+        model.layer_norm.b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_feat_dim);
+        model.tensors["model/layer_norm/w"] = model.layer_norm.w;
+        model.tensors["model/layer_norm/b"] = model.layer_norm.b;
 
         
 
         for (int i = 0; i < n_audio_enc_layer; ++i) {
             auto & layer = model.audio_enc_layers[i];
 
-            layer.self_attn_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
-            layer.self_attn_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.self_attn_layer_norm.w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.self_attn_layer_norm.b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
 
             layer.self_attn_linear_k_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
             layer.self_attn_linear_k_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
@@ -309,8 +308,8 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
             layer.self_attn_pos_bias_u = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim / n_audio_enc_head, n_audio_enc_head);
             layer.self_attn_pos_bias_v = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim / n_audio_enc_head, n_audio_enc_head);
 
-            layer.conv_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
-            layer.conv_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.conv_layer_norm.w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.conv_layer_norm.b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
 
             layer.conv_pointwise_conv1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, 2*n_audio_enc_dim);
             layer.conv_depthwise_conv_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 31, n_audio_enc_dim);
@@ -323,8 +322,8 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
 
             layer.conv_pointwise_conv2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
 
-            layer.ffn1_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
-            layer.ffn1_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.ffn1_layer_norm.w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.ffn1_layer_norm.b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
 
             layer.ffn1_w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_ffn_dim);
             layer.ffn1_b1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim);
@@ -332,8 +331,8 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
             layer.ffn1_w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim, n_audio_enc_dim);
             layer.ffn1_b2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
 
-            layer.ffn2_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
-            layer.ffn2_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.ffn2_layer_norm.w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.ffn2_layer_norm.b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
 
             layer.ffn2_w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_ffn_dim);
             layer.ffn2_b1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim);
@@ -341,13 +340,13 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
             layer.ffn2_w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim, n_audio_enc_dim);
             layer.ffn2_b2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
 
-            layer.final_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
-            layer.final_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.final_layer_norm.w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
+            layer.final_layer_norm.b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
 
             // map by name
 
-            model.tensors["model/enc/h" + std::to_string(i) + "/self_attn_layer_norm/w"] = layer.self_attn_layer_norm_w;
-            model.tensors["model/enc/h" + std::to_string(i) + "/self_attn_layer_norm/b"] = layer.self_attn_layer_norm_b;
+            model.tensors["model/enc/h" + std::to_string(i) + "/self_attn_layer_norm/w"] = layer.self_attn_layer_norm.w;
+            model.tensors["model/enc/h" + std::to_string(i) + "/self_attn_layer_norm/b"] = layer.self_attn_layer_norm.b;
 
             model.tensors["model/enc/h" + std::to_string(i) + "/self_attn_linear_k/w"] = layer.self_attn_linear_k_w;
             model.tensors["model/enc/h" + std::to_string(i) + "/self_attn_linear_k/b"] = layer.self_attn_linear_k_b;
@@ -361,8 +360,8 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
             model.tensors["model/enc/h" + std::to_string(i) + "/self_attn_pos_bias/u"] = layer.self_attn_pos_bias_u;
             model.tensors["model/enc/h" + std::to_string(i) + "/self_attn_pos_bias/v"] = layer.self_attn_pos_bias_v;
 
-            model.tensors["model/enc/h" + std::to_string(i) + "/conv_layer_norm/w"]        = layer.conv_layer_norm_w;
-            model.tensors["model/enc/h" + std::to_string(i) + "/conv_layer_norm/b"]        = layer.conv_layer_norm_b;
+            model.tensors["model/enc/h" + std::to_string(i) + "/conv_layer_norm/w"]        = layer.conv_layer_norm.w;
+            model.tensors["model/enc/h" + std::to_string(i) + "/conv_layer_norm/b"]        = layer.conv_layer_norm.b;
 
             model.tensors["model/enc/h" + std::to_string(i) + "/conv_pointwise_conv1/w"] = layer.conv_pointwise_conv1_w;
             model.tensors["model/enc/h" + std::to_string(i) + "/conv_depthwise_conv/w"] = layer.conv_depthwise_conv_w;
@@ -373,22 +372,22 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
             model.tensors["model/enc/h" + std::to_string(i) + "/conv_batch_norm/n"] = layer.conv_batch_norm_num_batches_tracked;
             model.tensors["model/enc/h" + std::to_string(i) + "/conv_pointwise_conv2/w"] = layer.conv_pointwise_conv2_w;
 
-            model.tensors["model/enc/h" + std::to_string(i) + "/ffn1_layer_norm/w"] = layer.ffn1_layer_norm_w;
-            model.tensors["model/enc/h" + std::to_string(i) + "/ffn1_layer_norm/b"] = layer.ffn1_layer_norm_b;
+            model.tensors["model/enc/h" + std::to_string(i) + "/ffn1_layer_norm/w"] = layer.ffn1_layer_norm.w;
+            model.tensors["model/enc/h" + std::to_string(i) + "/ffn1_layer_norm/b"] = layer.ffn1_layer_norm.b;
             model.tensors["model/enc/h" + std::to_string(i) + "/ffn1_w_1/w"] = layer.ffn1_w1;
             model.tensors["model/enc/h" + std::to_string(i) + "/ffn1_w_1/b"] = layer.ffn1_b1;
             model.tensors["model/enc/h" + std::to_string(i) + "/ffn1_w_2/w"] = layer.ffn1_w2;
             model.tensors["model/enc/h" + std::to_string(i) + "/ffn1_w_2/b"] = layer.ffn1_b2;
 
-            model.tensors["model/enc/h" + std::to_string(i) + "/ffn2_layer_norm/w"] = layer.ffn2_layer_norm_w;
-            model.tensors["model/enc/h" + std::to_string(i) + "/ffn2_layer_norm/b"] = layer.ffn2_layer_norm_b;
+            model.tensors["model/enc/h" + std::to_string(i) + "/ffn2_layer_norm/w"] = layer.ffn2_layer_norm.w;
+            model.tensors["model/enc/h" + std::to_string(i) + "/ffn2_layer_norm/b"] = layer.ffn2_layer_norm.b;
             model.tensors["model/enc/h" + std::to_string(i) + "/ffn2_w_1/w"] = layer.ffn2_w1;
             model.tensors["model/enc/h" + std::to_string(i) + "/ffn2_w_1/b"] = layer.ffn2_b1;
             model.tensors["model/enc/h" + std::to_string(i) + "/ffn2_w_2/w"] = layer.ffn2_w2;
             model.tensors["model/enc/h" + std::to_string(i) + "/ffn2_w_2/b"] = layer.ffn2_b2;
 
-            model.tensors["model/enc/h" + std::to_string(i) + "/final_layer_norm/w"] = layer.final_layer_norm_w;
-            model.tensors["model/enc/h" + std::to_string(i) + "/final_layer_norm/b"] = layer.final_layer_norm_b;
+            model.tensors["model/enc/h" + std::to_string(i) + "/final_layer_norm/w"] = layer.final_layer_norm.w;
+            model.tensors["model/enc/h" + std::to_string(i) + "/final_layer_norm/b"] = layer.final_layer_norm.b;
         }
     }
 
@@ -468,6 +467,21 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
     return true;
 }
 
+extern "C" ggml_tensor* unity_layer_norm(  // Returns add(mul(repeat(w), norm(cur)), repeat(b)): the same op sequence the encoder graph currently spells out inline.
+    ggml_context* ctx,  // context handed to every ggml call made here
+    ggml_tensor* cur,  // input tensor; after normalization it is also the shape target of both ggml_repeat calls
+    const layer_norm_layer& layer,  // supplies the scale (w) and shift (b) tensors
+    const unity_hparams& hparams  // only hparams.eps is read. NOTE(review): extern "C" with C++ reference parameters; confirm all callers are C++
+) {
+    cur = ggml_norm(ctx, cur, hparams.eps);  // normalization step, using eps from the hyper-parameters
+    return ggml_add(
+        ctx,
+        ggml_mul(ctx, ggml_repeat(ctx, layer.w, cur), cur),  // w is repeated to cur's shape, then multiplied with the normalized cur
+        ggml_repeat(ctx, layer.b, cur)  // b is repeated to cur's shape and added to the product
+    );
+}
+
+
 // build the computation graph
 extern "C" ggml_cgraph* unity_audio_encoder_graph(
     const unity_model & model,
@@ -508,9 +522,9 @@ extern "C" ggml_cgraph* unity_audio_encoder_graph(
         cur = ggml_norm(ctx0, cur, hparams.eps);
         cur = ggml_add(ctx0,
                 ggml_mul(ctx0,
-                    ggml_repeat(ctx0, layer.ffn1_layer_norm_w, cur),
+                    ggml_repeat(ctx0, layer.ffn1_layer_norm.w, cur),
                     cur),
-                ggml_repeat(ctx0, layer.ffn1_layer_norm_b, cur));
+                ggml_repeat(ctx0, layer.ffn1_layer_norm.b, cur));
         // FFN1: proj
         cur = ggml_mul_mat(ctx0, layer.ffn1_w1, cur);
         cur = ggml_add(ctx0, ggml_repeat(ctx0, layer.ffn1_b1, cur), cur);
@@ -525,12 +539,13 @@ extern "C" ggml_cgraph* unity_audio_encoder_graph(
         // TODO: Opportunity to optimize attn calculation (1) For num_threads > 1 (2) Flash attn. See https://github.com/ggerganov/ggml/blob/main/examples/gpt-2/main.cpp 
 
         // self_attn: layernorm
+        // unity_layer_norm
         cur = ggml_norm(ctx0, cur, hparams.eps);
         cur = ggml_add(ctx0,
                 ggml_mul(ctx0,
-                    ggml_repeat(ctx0, layer.self_attn_layer_norm_w, cur),
+                    ggml_repeat(ctx0, layer.self_attn_layer_norm.w, cur),
                     cur),
-                ggml_repeat(ctx0, layer.self_attn_layer_norm_b, cur));
+                ggml_repeat(ctx0, layer.self_attn_layer_norm.b, cur));
         
         // self_attn: qkv
         struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,