@@ -175,7 +175,7 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     ggml_tensor* queries, // (slen, d_in)
     ggml_tensor* keys, // (klen, d_in)
     ggml_tensor* values, // (klen, d_out)
-    ggml_tensor* mask // (klen, slen)
+    ggml_tensor* attn_mask // (klen, slen)
 ) {
     int model_dim = queries->ne[0];
     int num_heads = 16; // TODO: read from hparams
@@ -197,7 +197,7 @@ extern "C" ggml_tensor* MultiheadAttention_forward(

 #if UNITY_FLASH_ATTN
     // For flash_attn, we assume either no masks, or triangular masks.
-    ggml_tensor* attn = ggml_flash_attn(ctx, q, k, v, /*masked*/mask != nullptr); // (B * H, S, H_dim)
+    ggml_tensor* attn = ggml_flash_attn(ctx, q, k, v, /*masked*/attn_mask != nullptr); // (B * H, S, H_dim)
     ggml_set_name(attn, "attn");
     // TODO test !
     attn = ggml_unflatten_1d(ctx, attn, 2, num_heads); // (B, H, H_dim, S)
@@ -212,8 +212,7 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     ggml_set_name(qk, "qk_scaled");

     // TODO: Should we replace this by ggml_diag_mask_inf ?
-    // TODO: masks have the wrong shape to be added here
-    if (mask) qk = ggml_add(ctx, qk, mask);
+    if (attn_mask) qk = ggml_add(ctx, qk, attn_mask);
     // TODO: upgrade qk to float32 if needed
     ggml_tensor* attn_weights = ggml_soft_max(ctx, qk); // (B * H, S, Sk)
     ggml_set_name(attn_weights, "attn_weights");
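Note on the two masking hunks above: attn_mask is additive, holding 0 at visible
positions and -INFINITY at masked ones, so ggml_add followed by ggml_soft_max drives
the masked weights to exactly zero. A minimal plain-C++ sketch of that convention
(illustrative only, not part of the patch; assumes at least one visible position):

#include <algorithm>
#include <cmath>
#include <vector>

// One row of scaled qk scores for a single query position.
// mask[i] is 0.0f where key i is visible and -INFINITY where it is masked.
std::vector<float> masked_softmax(std::vector<float> scores, const std::vector<float>& mask) {
    float max_val = -INFINITY;
    for (std::size_t i = 0; i < scores.size(); ++i) {
        scores[i] += mask[i];  // masked positions become -INFINITY
        max_val = std::max(max_val, scores[i]);
    }
    float sum = 0.0f;
    for (float& s : scores) { s = std::exp(s - max_val); sum += s; }  // exp(-INFINITY) == 0
    for (float& s : scores) s /= sum;  // masked positions end up with weight 0
    return scores;
}

The removal of the "masks have the wrong shape to be added here" TODO suggests the
(klen, slen) mask from the new signature now lines up with the qk scores.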
@@ -257,7 +256,7 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
         seqs,
         seqs,
         seqs,
-        /*attention masks=*/nullptr
+        /*attn_mask=*/nullptr
     );

     if (has_layer(model, prefix + ".self_attn_norm"))
@@ -415,7 +414,7 @@ extern "C" ggml_tensor* StandardTransformerDecoderLayer_forward(
         seqs,
         seqs,
         seqs,
-        /*attention masks=*/self_attn_mask
+        /*attn_mask=*/self_attn_mask
     );

     if (has_layer(model, prefix + ".self_attn_norm"))
@@ -617,8 +616,7 @@ void _bootstrap_seqs_and_scores(
         seqs,
         /*padding_mask*/ nullptr,
         encoder_output,
-        // we assume there is only one input, and therefore we don't need padding.
-        /*encoder_padding_mask*/ nullptr
+        encoder_padding_mask
         // TODO: state_bag
     );
     // TODO state_bag.increment_step(prefix_seq_len - 1)
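This hunk, and the matching one in generate_sequence below, stop assuming a single
unpadded source sequence and thread the real encoder_padding_mask through to the
decoder's cross-attention. Under the additive convention sketched above, such a mask
could be built from the source length roughly like this (hypothetical helper, not
part of the patch):

#include <cmath>
#include "ggml.h"

// Hypothetical: additive padding mask for a source padded to max_len,
// where only the first actual_len positions hold real tokens.
ggml_tensor* make_padding_mask(ggml_context* ctx, int64_t max_len, int64_t actual_len) {
    ggml_tensor* mask = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, max_len);
    float* data = (float*)mask->data;
    for (int64_t i = 0; i < max_len; ++i)
        data[i] = (i < actual_len) ? 0.0f : -INFINITY;  // padded tail is masked out
    return mask;
}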
@@ -684,12 +682,14 @@ void _finalize_hypothesis(
     float eos_score,
     ggml_tensor* seqs, // (beam_size, seq_len)
     ggml_tensor* scores, // (beam_size, seq_len)
-    std::vector<Hypothesis>& hypotheses
+    Hypothesis* hypothesis
 ) {
-    ggml_tensor* tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
+    ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
+    hypothesis->seq = seq;
     ggml_tensor* step_scores = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, step_nr + 2);
+    hypothesis->step_scores = step_scores;

-    auto tok = (std::int32_t*)tokens->data;
+    auto tok = (std::int32_t*)seq->data;
     for (int i = 0; i < step_nr + 1; ++i) {
         tok[i] = ggml_get_i32_1d(seqs, seqs->ne[0] * beam + i);
     }
@@ -708,8 +708,7 @@ void _finalize_hypothesis(
     if (job.opts.normalize_scores)
         // Skip first EOS since it is always 0 and skews normalization.
         eos_score /= (float)std::pow((step_nr + 1), job.opts.len_penalty);
-
-    hypotheses.emplace_back(Hypothesis{tokens, eos_score, step_scores});
+    hypothesis->score = eos_score;
 }

 // Uses ggml_context to store any object.
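The two _finalize_hypothesis hunks replace "append to a vector" with "fill a
caller-provided slot field by field". From the assignments here and the
{nullptr, -INFINITY, nullptr} initializer in the next hunk, Hypothesis presumably
looks something like this (a sketch; the real definition lives elsewhere in the file):

struct Hypothesis {
    ggml_tensor* seq;          // (seq_len) generated tokens, ending in EOS
    float score;               // final score, length-normalized when requested
    ggml_tensor* step_scores;  // (seq_len) score contributed by each step
};

Since every field is written unconditionally (seq and step_scores up front, score at
the end), a slot handed to _finalize_hypothesis is always fully initialized.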
@@ -741,8 +740,11 @@ extern "C" Hypothesis* generate_sequence(
     // (S_enc, M) -> (B, S_enc, M)
     _fan_out_encoder_output(ctx, &encoder_output, &encoder_padding_mask, beam_size);

-    std::vector<Hypothesis> finished_searches;
-    finished_searches.reserve(beam_size);
+    // Allocate results in the context provided by the caller.
+    Hypothesis* finished_searches_begin = GGML_CTX_ALLOC(result_ctx, Hypothesis, beam_size);
+    Hypothesis* finished_searches = finished_searches_begin;
+    for (std::size_t i = 0; i < beam_size; ++i) finished_searches[i] = {nullptr, -INFINITY, nullptr};
+    Hypothesis* finished_searches_end = finished_searches + beam_size;

     // Initialize buffers. (B, S)
     ggml_tensor* seqs = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, max_seq_len, beam_size);
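GGML_CTX_ALLOC is a project-local helper (see the "Uses ggml_context to store any
object" comment above) that carves a typed array out of a ggml_context, so the
results live in the caller-owned result_ctx rather than in a std::vector that dies
with this function. One plausible definition, assuming it reuses the data buffer of
a 1-D i8 tensor (the actual macro may differ):

// Hypothetical sketch: back an array of count objects of type Type
// with the data buffer of a ggml i8 tensor allocated in ctx.
#define GGML_CTX_ALLOC(ctx, Type, count) \
    ((Type*)ggml_new_tensor_1d((ctx), GGML_TYPE_I8, sizeof(Type) * (count))->data)

No destructor ever runs for storage obtained this way; that is fine here because
Hypothesis is plain data, freed wholesale when the caller frees result_ctx.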
@@ -770,9 +772,7 @@ extern "C" Hypothesis* generate_sequence(
     for (std::size_t i = 0; i < vocab_size * beam_size; ++i)
         ((int32_t *)(candidate_indices->data))[i] = i;

-    // TODO: memory management
-    // there should be a per-step ggml_context for intermediary results
-    // start of beam search:
+    // TODO: memory management, there should be a per-step ggml_context for intermediary results
     for (int step_nr = start_step; step_nr < max_seq_len - 1; ++step_nr) {
         // because of no IncrementalStateBag we pass input from the start
         // decoder_input = seqs[:, 0 : step_nr + 1]
@@ -784,7 +784,7 @@ extern "C" Hypothesis* generate_sequence(
             decoder_input,
             nullptr, // We never generate PAD.
             encoder_output,
-            /*encoder_padding_mask*/ nullptr // TODO: do we need padding for encoder ?
+            encoder_padding_mask
             // state_bag=state_bag,
         ); // (B, S, D)

@@ -873,8 +873,8 @@ extern "C" Hypothesis* generate_sequence(
             bool eos = token == job.eos_idx;
             eos &= tok_score != -INFINITY;
             if (eos) {
-                _finalize_hypothesis(job, result_ctx, step_nr, beam, token, tok_score, seqs, scores, finished_searches);
-                if (finished_searches.size() >= beam_size)
+                _finalize_hypothesis(job, result_ctx, step_nr, beam, token, tok_score, seqs, scores, finished_searches++);
+                if (finished_searches == finished_searches_end)
                     goto end_of_beam_search;
                 continue;
             }
@@ -921,20 +921,12 @@ extern "C" Hypothesis* generate_sequence(
 end_of_beam_search:
     // Ensure that hypotheses are sorted by decreasing scores before returning.
     std::sort(
-        finished_searches.begin(),
-        finished_searches.end(),
+        finished_searches_begin,
+        finished_searches_end,
         [](Hypothesis a, Hypothesis b) { return a.score > b.score; }
     );

-    // Copy the scores to an object in the result_ctx.
-    GGML_ASSERT(finished_searches.size() <= beam_size);
-    Hypothesis* result = GGML_CTX_ALLOC(result_ctx, struct Hypothesis, beam_size);
-    std::copy(finished_searches.begin(), finished_searches.end(), result);
-    // In case we have less searches than expected, still make sure to initialize the memory.
-    for (std::size_t i = finished_searches.size(); i < beam_size; ++i)
-        result[i] = Hypothesis{nullptr, -INFINITY, nullptr};
-
-    return result;
+    return finished_searches_begin;
 }

 extern "C" Hypothesis* _testing_return_hypothesis_ptr(ggml_context* ctx) {
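Net effect of the generate_sequence hunks: finished hypotheses are written directly
into result_ctx as the search runs, the final copy loop is gone, and the returned
array always holds exactly beam_size entries, with unfilled slots left as
{nullptr, -INFINITY, nullptr} and pushed to the back by the descending sort. A caller
can therefore walk the result like this (sketch; only the return value's layout is
assumed, not generate_sequence's full signature):

#include <cstddef>
#include <cstdio>

// Print the hypotheses returned by generate_sequence, best first.
// Unfilled slots have a null seq and sort to the back, so stop at the first one.
void print_hypotheses(const Hypothesis* hyps, std::size_t beam_size) {
    for (std::size_t i = 0; i < beam_size; ++i) {
        if (hyps[i].seq == nullptr) break;  // fewer than beam_size searches finished
        std::printf("hyp %zu: score=%.3f, %lld tokens\n",
                    i, hyps[i].score, (long long)hyps[i].seq->ne[0]);
    }
}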