Selaa lähdekoodia

job.num_threads

# Conflicts:
#	ggml/examples/unity/fairseq2.cpp

# Conflicts:
#	ggml/examples/unity/fairseq2.cpp
Guillaume Wenzek 1 vuosi sitten
vanhempi
commit
ac4a2eada9
2 muutettua tiedostoa jossa 36 lisäystä ja 35 poistoa
  1. 35 35
      ggml/examples/unity/fairseq2.cpp
  2. 1 0
      ggml/examples/unity/fairseq2.h

+ 35 - 35
ggml/examples/unity/fairseq2.cpp

@@ -473,7 +473,7 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
 extern "C" ggml_tensor* WaveformToFbank_forward(
     fairseq2_model& model,
     const std::string &prefix,
-    ggml_tensor* waveform 
+    ggml_tensor* waveform
 ) {
     // Hardcoding: num_bins 80, sample rate 16k, always standardize
     ggml_context* ctx = model.ctx;
@@ -486,7 +486,7 @@ extern "C" ggml_tensor* WaveformToFbank_forward(
     knf::FbankOptions opts{};
     opts.frame_opts = frame_opts;
     opts.mel_opts = mel_opts;
-    
+
 
     std::vector<float_t> signal_frame{};
     std::int32_t num_frames = knf::NumFrames(/*num_samples=*/waveform->ne[0], frame_opts);
@@ -538,13 +538,13 @@ extern "C" ggml_tensor* RelativePositionMHA_forward(
     struct ggml_tensor * Qcur = Linear_forward(model, prefix + ".q_proj", seqs);
     struct ggml_tensor * Kcur = Linear_forward(model, prefix + ".k_proj", seqs);
     struct ggml_tensor * Vcur = Linear_forward(model, prefix + ".v_proj", seqs);
-    
+
     // self_attn: rel_pos SDPA
     int32_t S = seqs->ne[1];
     int32_t H = 16; // TODO: Make this configurable
     int32_t n_ctx = 4096;
     int32_t K_h = seqs->ne[0] / H;
-    
+
     int32_t start_index = n_ctx - S;
     int32_t end_index = n_ctx + S - 1;
 
@@ -556,10 +556,10 @@ extern "C" ggml_tensor* RelativePositionMHA_forward(
     for (int i = 0; i < num_indices; i++) {
         ((int32_t *)rows->data)[i] = start_index + i;
     }
-    
+
     // self_attn: load pos_enc weights & compute_r
-    // In fairseq2 pos_enc weights are calculated on the fly, since some more custom operators might be needed to enable this, 
-    // we store the results (fixed) in checkpoint as model.audio_enc_pos_enc_w and load directly. 
+    // In fairseq2 pos_enc weights are calculated on the fly, since some more custom operators might be needed to enable this,
+    // we store the results (fixed) in checkpoint as model.audio_enc_pos_enc_w and load directly.
     struct ggml_tensor * r = ggml_get_rows(ctx, model.tensors["speech_encoder.pos_enc"], rows);
     r = ggml_mul_mat(ctx, model.tensors[prefix + ".sdpa.r_proj.weight"], r);
     r = ggml_dup(ctx, ggml_permute(ctx,
@@ -567,55 +567,55 @@ extern "C" ggml_tensor* RelativePositionMHA_forward(
                             r,
                             ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S*2-1)),
                         0, 2, 1, 3));
-    
+
     struct ggml_tensor * u_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.u_bias"], K_h, 1, H);
     struct ggml_tensor * v_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.v_bias"], K_h, 1, H);
 
     // self_attn: Permute QKV
-    
+
     struct ggml_tensor * Q =
                 ggml_dup(ctx, ggml_permute(ctx,
                         ggml_cpy(ctx,
                             Qcur,
                             ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
                         0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
-    struct ggml_tensor * K = 
+    struct ggml_tensor * K =
                 ggml_dup(ctx, ggml_permute(ctx,
                         ggml_cpy(ctx,
                             Kcur,
                             ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
                         0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
-    struct ggml_tensor * V = 
+    struct ggml_tensor * V =
                 ggml_dup(ctx, ggml_permute(ctx,
                         ggml_cpy(ctx,
                             Vcur,
                             ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
                         1, 2, 0, 3)); // (H * K_h, S) -> (K_h, H, S) -> (H, S, K_h)
-    
-    
+
+
     struct ggml_tensor * q_with_u_bias = ggml_add(ctx, Q, u_bias); // (K_h, S, H)
     struct ggml_tensor * q_with_v_bias = ggml_add(ctx, Q, v_bias); // (K_h, S, H)
-    
+
     struct ggml_tensor * ac = ggml_mul_mat(ctx, K, q_with_u_bias);
     struct ggml_tensor * bd = ggml_mul_mat(ctx, r, q_with_v_bias);
-    
-    
+
+
     // self_attn: shift_bd. Logic follows https://github.com/facebookresearch/fairseq2/blob/main/src/fairseq2/nn/transformer/relative_attention.py#L161
     bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // H, S, 2S-1
-    
+
     struct ggml_tensor * pad = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, H, S, 1);
     pad->data = malloc(ggml_nbytes(pad));
 
     pad = ggml_set_f32(pad, 0.0);
     bd = ggml_concat(ctx, pad, bd); // bd[i][j][0] == 0, (H, S, 2S)
-    bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // (2S, S, H) 
+    bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // (2S, S, H)
     bd = ggml_dup(ctx, ggml_reshape_3d(ctx, bd, S, 2*S, H));  // (S, 2S, H)
     bd = ggml_remove_head_row(ctx, bd); // A custom operator introduced to reduce 1st row (in the 2nd dim)
 
     bd = ggml_reshape_3d(ctx, bd, 2*S-1, S, H);
 
-    bd = ggml_get_first_cols_by_rows(ctx, bd); // A custom operator introduced to get first #rows cols. 
-    
+    bd = ggml_get_first_cols_by_rows(ctx, bd); // A custom operator introduced to get first #rows cols.
+
 
     // self_attn: compute attn / weights
     struct ggml_tensor * attn_weights = ggml_add(ctx, ac, bd);
@@ -624,11 +624,11 @@ extern "C" ggml_tensor* RelativePositionMHA_forward(
     ggml_set_f32(attn_scale, 1.0 / pow(K_h, 0.5));
     attn_weights = ggml_mul(ctx, ggml_repeat(ctx, attn_scale, attn_weights), attn_weights);
     attn_weights = ggml_soft_max(ctx, attn_weights);
-    
+
     struct ggml_tensor * attn = ggml_mul_mat(ctx, V, attn_weights); // K_h, S, H
     attn = ggml_dup(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));
-    struct ggml_tensor * attn_2d = ggml_reshape_2d(ctx, attn, K_h * H, S); 
-    
+    struct ggml_tensor * attn_2d = ggml_reshape_2d(ctx, attn, K_h * H, S);
+
     struct ggml_tensor * attn_out = ggml_mul_mat(ctx, model.tensors[prefix + ".output_proj.weight"], attn_2d);
     attn_out = ggml_add(ctx,
             ggml_repeat(ctx,
@@ -649,23 +649,23 @@ extern "C" ggml_tensor* ConvModule_forward(
         seqs = LayerNorm_forward(model, prefix + "_layer_norm", seqs);
         // conv: Use matmul for pointwise conv 1 - kernel_size=1, no padding case
         seqs = ggml_mul_mat(ctx, model.tensors[prefix + ".pointwise_conv1.weight"], seqs);
-        
+
         // conv: GLU
         seqs = ggml_glu(ctx, seqs);
         seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
-       
+
         // S x C -> (S+K-1) x C -> K x S x C -> S x C
         seqs = ggml_conv_1d(ctx, model.tensors[prefix + ".depthwise_conv.weight"], seqs, 1, 15, 1);
-        
+
         // conv: Custom implementation of batch norm
         seqs = ggml_batch_norm(ctx, seqs, model.tensors[prefix + ".batch_norm.weight"], model.tensors[prefix + ".batch_norm.bias"], model.tensors[prefix + ".batch_norm.running_mean"], model.tensors[prefix + ".batch_norm.running_var"], 1e-5);
-        
+
         // conv: SiLU activation
         seqs = ggml_silu(ctx, seqs);
         seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
 
         // conv: Use matmul for pointwise conv 2 - kernel_size=1, no padding case
-        seqs = ggml_mul_mat(ctx, model.tensors[prefix + ".pointwise_conv2.weight"], seqs); 
+        seqs = ggml_mul_mat(ctx, model.tensors[prefix + ".pointwise_conv2.weight"], seqs);
 
         // conv: + residual
         seqs = ggml_add(ctx, seqs, residual);
@@ -709,9 +709,9 @@ extern "C" ggml_tensor* StandardConformerEncoder_forward(
     seqs = LayerNorm_forward(model, prefix + "_frontend.post_extract_layer_norm", seqs);
     seqs = Linear_forward(model, prefix + "_frontend.model_dim_proj", seqs);
     int layer_idx = 0;
-    
+
     std::string layer_name = prefix + ".inner.layers." + std::to_string(layer_idx);
-    
+
     while (has_layer(model, layer_name)) {
         seqs = StandardConformerEncoderLayer_forward(
             model, layer_name, seqs, padding_mask
@@ -742,7 +742,7 @@ extern "C" ggml_tensor* StandardConformerEncoder_forward(
         layer_name = prefix + ".adaptor_layers." + std::to_string(layer_idx);
     }
     seqs = LayerNorm_forward(model, prefix + ".layer_norm", seqs);
-    
+
     return seqs;
 }
 
@@ -756,7 +756,7 @@ extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
     struct ggml_tensor * residual = seqs;
     residual = LayerNorm_forward(model, prefix + ".residual_layer_norm", residual);
     residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
-    residual = ggml_conv_1d_generic(ctx, model.tensors[prefix + ".residual_conv.weight"], residual, 8, 4, 1); 
+    residual = ggml_conv_1d_generic(ctx, model.tensors[prefix + ".residual_conv.weight"], residual, 8, 4, 1);
     residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
     residual = ggml_add(ctx, ggml_repeat(ctx, model.tensors[prefix + ".residual_conv.bias"], residual), residual);
     residual = ggml_glu(ctx, residual);
@@ -766,8 +766,8 @@ extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
     seqs = ggml_conv_1d_generic(ctx, model.tensors[prefix + ".self_attn_conv.weight"], seqs, 8, 4, 1);
     seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
     seqs = ggml_add(ctx, ggml_repeat(ctx, model.tensors[prefix + ".self_attn_conv.bias"], seqs), seqs);
-    seqs = ggml_glu(ctx, seqs); 
-    
+    seqs = ggml_glu(ctx, seqs);
+
     seqs = MultiheadAttention_forward(
         model,
         prefix + ".self_attn",
@@ -1388,7 +1388,7 @@ extern "C" Hypothesis* generate_sequence(
             ggml_detach(new_scores);
             new_seqs->type = GGML_TYPE_I32;
         }
-        
+
         // new_seqs[:, step_nr + 1] = next_tokens
         // new_scores[:, step_nr + 1] = next_scores
         for (std::size_t i = 0; i < beam_size; ++i) {

+ 1 - 0
ggml/examples/unity/fairseq2.h

@@ -210,6 +210,7 @@ struct SequenceGeneratorJob {
     std::int32_t unk_idx;
     std::int32_t bos_idx;
     std::int32_t eos_idx;
+    std::int32_t num_threads;
 };
 
 /// Represents a hypothesis produced by a sequence generator.