
allow enc_kv_cache to be filled up later

Guillaume Wenzek, 1 year ago
Commit 8acb4fe8fb
2 changed files with 12 additions and 25 deletions:
  1. ggml/examples/unity/fairseq2.cpp (+11, -24)
  2. ggml/examples/unity/fairseq2.h (+1, -1)

ggml/examples/unity/fairseq2.cpp (+11, -24)

@@ -58,7 +58,6 @@ extern "C" void fairseq2_kv_cache_alloc(fairseq2_model& model, ggml_context* kv_
     // Note: we only allocate the masks, proper kv cache allocation is delayed.
     GGML_ASSERT(kv_cache_ctx);
     GGML_ASSERT(!ggml_get_no_alloc(kv_cache_ctx));  // We need to be able to alloc the kv_cache buffers
-    model.kv_cache_ctx = kv_cache_ctx;
     auto attn_glob = "text_decoder.*_attn.k_proj.weight";
     FORCE_ALLOC(self_attn_mask, kv_cache_ctx, ggml_new_tensor_2d(kv_cache_ctx, GGML_TYPE_F32, max_seq_len, max_seq_len));
     self_attn_mask = ggml_diag_mask_inf_inplace(kv_cache_ctx, self_attn_mask, 0);
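
For context, this is the mask that stays behind in the dedicated context. A minimal sketch of pre-allocating such a mask (standard ggml API; the pool size and sequence length are made-up values):

#include "ggml.h"

int main() {
    // A context whose buffers must outlive a single decoding step.
    ggml_init_params params = {
        /*mem_size   =*/ 16 * 1024 * 1024,
        /*mem_buffer =*/ nullptr,
        /*no_alloc   =*/ false,  // the mask needs a real buffer
    };
    ggml_context* kv_cache_ctx = ggml_init(params);

    const int max_seq_len = 64;
    ggml_tensor* self_attn_mask = ggml_new_tensor_2d(
        kv_cache_ctx, GGML_TYPE_F32, max_seq_len, max_seq_len);
    // Becomes -INF above the diagonal once the graph is evaluated.
    self_attn_mask = ggml_diag_mask_inf_inplace(kv_cache_ctx, self_attn_mask, 0);

    ggml_free(kv_cache_ctx);
    return 0;
}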
@@ -109,7 +108,7 @@ inline ggml_tensor* ggml_unsqueeze(ggml_context* ctx, ggml_tensor* x, int dim) {
 void append_to_prev_kv(const fairseq2_model& model, const std::string& prefix, ggml_tensor** k, ggml_tensor** v, ggml_tensor** self_attn_mask) {
     KeyValueTensor& kv = model.kv_cache[prefix];
     int step_nr = kv.step_nr;
-    ggml_context* ctx = model.kv_cache_ctx ? model.kv_cache_ctx : model.ctx;
+    ggml_context* ctx = model.ctx;
     // We need to force allocation here, otherwise the kv_cache buffers can be reused
     bool no_alloc_save = ggml_get_no_alloc(ctx);
     ggml_set_no_alloc(ctx, false);
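
The save/restore around ggml_set_no_alloc matters because graph-building code usually runs with no_alloc enabled, and a tensor created in that mode gets no dedicated buffer, so the cache could be overwritten by later graph nodes. A minimal sketch of the idiom (standard ggml API; force_alloc_1d is a hypothetical helper):

#include "ggml.h"

// Force-allocate one tensor in a context that may be in no_alloc mode,
// then restore the caller's setting; mirrors the pattern above.
// (force_alloc_1d is a hypothetical helper, not part of ggml.)
ggml_tensor* force_alloc_1d(ggml_context* ctx, int64_t n) {
    bool no_alloc_save = ggml_get_no_alloc(ctx);
    ggml_set_no_alloc(ctx, false);          // give the tensor a real buffer
    ggml_tensor* t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
    ggml_set_no_alloc(ctx, no_alloc_save);  // back to graph-building mode
    return t;
}

int main() {
    ggml_init_params params = { 4 * 1024 * 1024, nullptr, /*no_alloc=*/ true };
    ggml_context* ctx = ggml_init(params);
    ggml_tensor* cache = force_alloc_1d(ctx, 1024);
    (void)cache;  // a real kv cache would be appended to at every step
    ggml_free(ctx);
    return 0;
}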
@@ -434,7 +433,7 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
             if (kv_cache.step_nr == 0) {
                 // If possible we use the ctx dedicated to kv_cache here,
                 // because the enc dec attention is typically long lived.
-                if (model.kv_cache_ctx) model.ctx = model.kv_cache_ctx;
+                if (model.enc_kv_cache_ctx) model.ctx = model.enc_kv_cache_ctx;
                 k = Linear_forward(model, prefix + ".k_proj", keys);
                 ggml_set_name(k, "k");
                 v = Linear_forward(model, prefix + ".v_proj", values);
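
Redirecting model.ctx works because the forward helpers allocate through it, so pointing it at a longer-lived context for one sub-graph controls the lifetime of the resulting tensors. A hedged sketch of the idiom, with a hypothetical Model struct and helper standing in for fairseq2_model and Linear_forward:

#include "ggml.h"

// Hypothetical stand-in for fairseq2_model: helpers allocate through `ctx`.
struct Model {
    ggml_context* ctx = nullptr;               // per-step scratch context
    ggml_context* enc_kv_cache_ctx = nullptr;  // lives for the whole search
};

// Allocate a tensor that must survive across decoding steps by temporarily
// redirecting the model's allocation context, as the attention code does.
ggml_tensor* long_lived_projection(Model& model, int64_t n) {
    ggml_context* step_ctx = model.ctx;
    if (model.enc_kv_cache_ctx) model.ctx = model.enc_kv_cache_ctx;
    ggml_tensor* k = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, n);
    model.ctx = step_ctx;  // later allocations go back to the step context
    return k;
}

int main() {
    ggml_init_params params = { 1024 * 1024, nullptr, false };
    Model model;
    model.ctx = ggml_init(params);
    model.enc_kv_cache_ctx = ggml_init(params);
    ggml_tensor* k = long_lived_projection(model, 16);
    (void)k;
    ggml_free(model.enc_kv_cache_ctx);
    ggml_free(model.ctx);
    return 0;
}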
@@ -1202,24 +1201,15 @@ void _bootstrap_seqs_and_scores(
     int max_seq_len = scores->ne[0];
     int beam_size = scores->ne[1];
     GGML_ASSERT(prefix_seq_len > 0);
+    ggml_context* ctx = model.ctx;
     if (prefix_seq_len == 1) {
-        // bootstrap all beams in full_seqs with EOS
-        // This is equivalent to:
-        // // full_seqs[:, : prefix_seq_len] = job.prefix_seq;
-        // because in normal case: prefix_seq[0] = EOS
-        // 
-        int eos_id = model.vocab.token_to_id["</s>"];
-        if (model.tgt_vocab.id_to_token.size()) {
-            eos_id = model.tgt_vocab.token_to_id["</s>"];
-        }
-        size_t vocab_size = model.tensors["text_decoder_frontend.embed.weight"]->ne[1];
-        for (int k = 0; k < beam_size; k++) {
-            ggml_set_i32_1d(full_seqs, k * vocab_size, eos_id);
-        }
+        // We only have one token in the prefix, so we won't compute decoding
+        // scores; we just need to copy that token into seqs.
+        // Note: this also means the enc_kv_cache will be populated later.
+        ggml_tensor* seqs = ggml_slice(ctx, full_seqs, 0, 0, prefix_seq_len);
+        ggml_set_i32(seqs, ggml_get_i32_1d(job.prefix_seq, 0));
         return;
     }
-        
-    ggml_context* ctx = model.ctx;
 
     // full_seqs[:, : prefix_seq_len] = job.prefix_seq;
     ggml_tensor* seqs = ggml_slice(ctx, full_seqs, 0, 0, prefix_seq_len);
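
With a one-token prefix the bootstrap reduces to writing prefix_seq[0] into position 0 of every beam. ggml_slice is a helper from this codebase, not core ggml, so here is an equivalent sketch with plain 1-d accessors, assuming full_seqs is a contiguous [max_seq_len, beam_size] I32 tensor (its stride between beams is then max_seq_len, which is also why the removed loop's vocab_size stride looked suspicious):

#include "ggml.h"

// Write the single prefix token into position 0 of every beam row.
// (bootstrap_single_token is a hypothetical helper for illustration.)
void bootstrap_single_token(ggml_tensor* full_seqs, ggml_tensor* prefix_seq,
                            int beam_size, int max_seq_len) {
    int32_t tok = ggml_get_i32_1d(prefix_seq, 0);
    for (int k = 0; k < beam_size; k++)
        ggml_set_i32_1d(full_seqs, k * max_seq_len, tok);
}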
@@ -1473,13 +1463,13 @@ extern "C" Hypothesis* generate_sequence(
 
     int prefix_seq_len = job.prefix_seq->ne[0];
     int start_step = prefix_seq_len - 1;
-    ggml_context* prev_step_ctx = ctx_from_buffer(local_bufs[(start_step - 1) % 2]);
-    ggml_context* step_ctx = ctx_from_buffer(local_bufs[start_step % 2]);	    
+    ggml_context* prev_step_ctx = ctx_from_buffer(local_bufs[(start_step + 1) % 2]);
+    ggml_context* step_ctx = ctx_from_buffer(local_bufs[start_step % 2]);
     GGML_ASSERT(step_ctx != search_ctx);
     GGML_ASSERT(prev_step_ctx != step_ctx);
     model.ctx = prev_step_ctx;
     // search_ctx because we need encoder_decoder_attn.k_cache to survive for the full search
-    model.kv_cache_ctx = search_ctx;
+    model.enc_kv_cache_ctx = search_ctx;
     ggml_tensor* lid_scores;
     if (lang_ids.size()) {
         lid_scores = ggml_new_tensor_1d(result_ctx, GGML_TYPE_F32, lang_ids.size());
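
The prev_step_ctx change is not cosmetic: with a one-token prefix (exactly the case this commit enables), start_step is 0 and (start_step - 1) % 2 evaluates to -1 in C++, indexing local_bufs out of bounds, while (start_step + 1) % 2 has the same parity as (start_step - 1) for every step and stays non-negative. A small check of the invariant:

#include <cassert>

int main() {
    for (int start_step = 0; start_step < 6; ++start_step) {
        int cur  = start_step % 2;
        int prev = (start_step + 1) % 2;  // same parity as start_step - 1 ...
        assert(prev == 1 - cur);          // ... i.e. always the other buffer
        // (start_step - 1) % 2 would evaluate to -1 when start_step == 0.
    }
    return 0;
}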
@@ -1490,9 +1480,6 @@ extern "C" Hypothesis* generate_sequence(
     );
     printf("Seqs dim after bootstrapping: [%d %d %d]\n", seqs->ne[0], seqs->ne[1], seqs->ne[2]);
 
-    // Now we will only add self_attn.k_cache and those need to be resorted and copied at every step.
-    model.kv_cache_ctx = nullptr;
-
     // Holds the indices of beams (a beam can occur more than once) that we
     // should continue with in the next step.
     ggml_tensor* beam_indices = ggml_new_tensor_1d(search_ctx, GGML_TYPE_I32, beam_size);

ggml/examples/unity/fairseq2.h (+1, -1)

@@ -110,7 +110,7 @@ struct fairseq2_model {
     // TODO: is this the best place to store this or should we also pass this to all forward methods ?
     ggml_context* ctx = nullptr;
 
-    ggml_context* kv_cache_ctx = nullptr;
+    ggml_context* enc_kv_cache_ctx = nullptr;
 };
 
 double fairseq2_model_layer_config_double(const fairseq2_model& model, std::string name);