@@ -629,7 +629,7 @@ int StandardBeamSearch_step(
     } else {
         // Make probabilities contain cumulative scores for each hypothesis.
         // TODO this seems incorrect
-        lprobs = ggml_add_inplace(ctx, lprobs, ggml_repeat(ctx, last_scores, lprobs));
+        lprobs = ggml_add(ctx, lprobs, ggml_repeat(ctx, last_scores, lprobs));
     }

     ggml_cgraph gf = ggml_build_forward(lprobs);
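
Aside (not part of the patch): a minimal, self-contained sketch of the repeat-then-add pattern this hunk switches to, written against the same old-style ggml API used in this file (ggml_build_forward returning a graph by value). The shapes are illustrative assumptions: lprobs as (vocab_size, beam_size) and last_scores as (1, beam_size), so ggml_repeat tiles one cumulative score per beam across the whole vocabulary before the element-wise ggml_add.

#include "ggml.h"

void add_cumulative_scores_sketch() {
    ggml_init_params params = { /*mem_size*/ 16 * 1024 * 1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context* ctx = ggml_init(params);

    const int vocab_size = 8;
    const int beam_size  = 2;
    // One row of log-probs per beam, and one cumulative score per beam.
    ggml_tensor* lprobs      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, vocab_size, beam_size);
    ggml_tensor* last_scores = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, beam_size);
    // ... fill both tensors ...

    // Tile last_scores to the shape of lprobs, then add out of place so the
    // original lprobs buffer is left untouched.
    ggml_tensor* cum = ggml_add(ctx, lprobs, ggml_repeat(ctx, last_scores, lprobs));

    ggml_cgraph gf = ggml_build_forward(cum);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads*/ 1);

    ggml_free(ctx);
}
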
@@ -650,6 +650,13 @@ int StandardBeamSearch_step(
     return topk;
 }

+
+void ggml_detach(ggml_tensor* a) {
+    a->op = GGML_OP_NONE;
+    a->src[0] = nullptr;
+}
+
+
 int _finalize_hypothesis(
     const SequenceGeneratorJob& job,
     ggml_context* ctx,
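
Aside (not part of the patch): what the new ggml_detach helper above buys us, sketched against the same ggml vintage. Once a tensor has been computed, clearing its op and src[0] turns it back into a plain leaf, so a later ggml_build_forward that reaches it stops there instead of re-walking (and re-running) the ops that produced it. The function name and shapes below are illustrative only.

#include "ggml.h"

// Same helper as in the patch, duplicated here so the sketch is self-contained.
static void ggml_detach(ggml_tensor* a) {
    a->op = GGML_OP_NONE;
    a->src[0] = nullptr;
}

void detach_sketch(ggml_context* ctx, ggml_tensor* x) {
    ggml_tensor* y = ggml_relu(ctx, x);        // y depends on x
    ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute_with_ctx(ctx, &gf, 1);  // y->data now holds the result

    ggml_detach(y);                            // y is now a constant leaf

    ggml_tensor* z = ggml_mul(ctx, y, y);      // a later graph reuses the computed y
    gf = ggml_build_forward(z);
    ggml_graph_compute_with_ctx(ctx, &gf, 1);  // only the mul is evaluated
}
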
@@ -667,9 +674,6 @@ int _finalize_hypothesis(
     // Detect beams that reached the minimum length and that end with an EOS.
     bool eos = token == job.eos_idx;
     eos &= tok_score != -INFINITY;
-    // TODO ignored_beam_mask ?
-    // eos &= ggml_get_i32_1d(ignored_beam_mask, beam);
-    // ggml_set_i32_1d(eos_mask, beam, eos);

     if (!eos) return 0;

@@ -696,18 +700,20 @@ int _finalize_hypothesis(
     // Skip first EOS since it is always 0 and skews normalization.
     tok_score /= (float)std::pow((step_nr + 1), job.opts.len_penalty);

+    // TODO the score computed here isn't the same as the one computed by fairseq2.
     hypotheses.emplace_back(Hypothesis{tokens, tok_score, step_scores});
     return 1;
 }

 /// Generates a translation for a single sequence
 // TODO: finish this for beam_size=1
-// * implement the lprobs tweaking
+// * find out why the score is different (the seq is the same though)
 // TODO: add IncrementalStateBag support to avoid a O(N^3) generation.
 // TODO: support beam_size > 1:
 // * most layers assume un-batched input, but we want to handle several beams at once
 // * need to port "reorder_state_dict"
-// * once beam are selected with topk, we need to update seqs and scores tensors
+// TODO: clean up
+// * replace manual tensor tweaking with ggml_set_*d (ggml_set_slice could be useful)
 extern "C" float generate_sequence(
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
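
Aside (not part of the patch): the normalization in this hunk, isolated as a tiny helper with a worked example, since the TODO above is about this exact score. With len_penalty = 1.0 this is plain length averaging: a hypothesis whose EOS lands at step_nr = 9 with a cumulative log-prob of -6.2 ends up with -6.2 / 10^1.0 = -0.62; values above 1.0 favour longer hypotheses.

#include <cmath>

// Mirrors `tok_score /= std::pow((step_nr + 1), job.opts.len_penalty)` above.
inline float normalize_score(float cum_score, int step_nr, float len_penalty) {
    return cum_score / (float)std::pow((float)(step_nr + 1), len_penalty);
}
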
@@ -805,6 +811,7 @@ extern "C" float generate_sequence(
         ggml_cgraph gf = ggml_build_forward(lprobs);
         printf("beam search step %d. Graph.n_nodes: %d\n", step_nr, gf.n_nodes);
         ggml_graph_compute_with_ctx(ctx, &gf, 1);
+        ggml_detach(lprobs);

         // // Do not allow EOS before reaching the minimum sequence length.
         if (step_nr < job.opts.min_seq_len) {
@@ -814,7 +821,7 @@ extern "C" float generate_sequence(
         }

         // If we have reached the maximum length, force the last step to be EOS.
-        // TODO: should this be done in an hadoc loop ? how often does that happen anyway ?
+        // TODO: should this be done in an ad hoc loop? How often does that happen anyway?
         if (step_nr == max_seq_len - 2) {
             // lprobs[:, :, : self.eos_idx] = -torch.inf
             // lprobs[:, :, self.eos_idx + 1 :] = -torch.inf
@@ -832,9 +839,14 @@ extern "C" float generate_sequence(
         for (size_t i = 0; i < beam_size; ++i)
             ggml_set_f32_1d(lprobs, vocab_size * i + pad_idx, -INFINITY);

-        // // Apply UNK penalty.
-        // if self.unk_idx is not None:
-        //     lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty
+        // Apply UNK penalty.
+        if (job.unk_idx >= 0 && job.opts.unk_penalty != 0) {
+            // lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty
+            auto lprobs_raw = ggml_get_data_f32(lprobs);
+            for (size_t i = 0; i < beam_size; ++i)
+                lprobs_raw[vocab_size * i + job.unk_idx] -= job.opts.unk_penalty;
+        }
+

         // Determine candidates for the next step.
         // (N, 2 x B)
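
Aside (not part of the patch): the UNK penalty above edits the computed lprobs buffer directly instead of adding more graph nodes, the same trick as the pad masking two lines earlier. A generic sketch of that pattern, assuming a contiguous F32 tensor of shape (vocab_size, beam_size) that has already been evaluated (hence the ggml_detach after compute):

#include "ggml.h"

// Subtract `penalty` from one vocabulary entry in every beam by writing straight
// into the tensor's data. Only valid after the graph producing `lprobs` has run,
// and only for a contiguous F32 tensor.
void penalize_token(ggml_tensor* lprobs, int64_t token_idx, float penalty) {
    const int64_t vocab_size = lprobs->ne[0];
    const int64_t beam_size  = lprobs->ne[1];
    float* data = ggml_get_data_f32(lprobs);
    for (int64_t b = 0; b < beam_size; ++b)
        data[b * vocab_size + token_idx] -= penalty;
}
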
@@ -871,24 +883,24 @@ extern "C" float generate_sequence(
         // Reorder beams in the `seq` and `score` buffers. The same beam can
         // be selected more than once.
         ggml_tensor* new_seqs = seqs;
+        // ggml_get_rows and ggml_set only work with floats ...
+        new_seqs->type = GGML_TYPE_F32;
         ggml_tensor* new_scores = scores;
         if (step_nr > start_step) {
             // (B, S), (B) -> (B, S)
-            // ggml_get_rows only work with floats ...
-            new_seqs->type = GGML_TYPE_F32;
             new_seqs = ggml_get_rows(ctx, seqs, beam_indices);
             new_scores = ggml_get_rows(ctx, new_scores, beam_indices);
         }

         // new_seqs[:, step_nr + 1] = next_tokens
-        ggml_set_1d_inplace(ctx, new_seqs, next_tokens, new_seqs->nb[0] * (step_nr + 1));
-        ggml_set_1d_inplace(ctx, new_scores, next_scores, new_scores->nb[0] * (step_nr + 1));
-
-        gf = ggml_build_forward(new_seqs);
+        gf = ggml_build_forward(ggml_set_1d_inplace(ctx, new_seqs, next_tokens, new_seqs->nb[0] * (step_nr + 1)));
         ggml_graph_compute_with_ctx(ctx, &gf, 1);
+        ggml_detach(new_seqs);
         new_seqs->type = GGML_TYPE_I32;
-        gf = ggml_build_forward(new_scores);
+
+        gf = ggml_build_forward(ggml_set_1d_inplace(ctx, new_scores, next_scores, new_scores->nb[0] * (step_nr + 1)));
         ggml_graph_compute_with_ctx(ctx, &gf, 1);
+        ggml_detach(new_scores);

         // TODO the old seqs and score buffers could be reused for next step
         seqs = new_seqs;
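
Aside (not part of the patch): the beam reordering above relies on ggml_get_rows, which gathers whole rows of a tensor by an I32 index tensor and allows repeated indices, which is exactly what is needed when one beam survives several times. In this ggml vintage it only supports float sources, hence the GGML_TYPE_F32 / GGML_TYPE_I32 flipping on the token buffer. A standalone sketch with illustrative names:

#include "ggml.h"

// Returns a tensor whose row i is row beam_indices[i] of `rows` (rows counted along
// the second ggml dimension). `rows` must be F32 here; `beam_indices` must be I32.
ggml_tensor* reorder_beams(ggml_context* ctx, ggml_tensor* rows, ggml_tensor* beam_indices) {
    ggml_tensor* reordered = ggml_get_rows(ctx, rows, beam_indices);
    ggml_cgraph gf = ggml_build_forward(reordered);
    ggml_graph_compute_with_ctx(ctx, &gf, 1);
    return reordered;
}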