
fix generation with beam_size=1

Guillaume Wenzek 1 year ago
parent commit bfbafd9603

+ 29 - 17
ggml/examples/unity/fairseq2.cpp

@@ -629,7 +629,7 @@ int StandardBeamSearch_step(
     } else {
         // Make probabilities contain cumulative scores for each hypothesis.
         // TODO this seems incorrect
-        lprobs = ggml_add_inplace(ctx, lprobs, ggml_repeat(ctx, last_scores, lprobs));
+        lprobs = ggml_add(ctx, lprobs, ggml_repeat(ctx, last_scores, lprobs));
     }

     ggml_cgraph gf = ggml_build_forward(lprobs);
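
Context for this one-line change: in ggml, ggml_add_inplace returns a view whose result is written back into the first operand's buffer when the graph runs, while ggml_add allocates a fresh output tensor and leaves the operand untouched. A minimal illustration of the difference (the names a and b are illustrative, not from the diff):

    // In-place: the result aliases `a`, so computing the graph overwrites it.
    ggml_tensor* r1 = ggml_add_inplace(ctx, a, b);   // r1 shares a->data
    // Out-of-place: the result gets its own buffer; `a` keeps its values.
    ggml_tensor* r2 = ggml_add(ctx, a, b);           // r2->data is separate
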
@@ -650,6 +650,13 @@ int StandardBeamSearch_step(
     return topk;
 }

+
+void ggml_detach(ggml_tensor* a) {
+    a->op = GGML_OP_NONE;
+    a->src[0] = nullptr;
+}
+
+
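
The new ggml_detach helper turns an already-computed tensor back into a graph leaf: with op cleared to GGML_OP_NONE and the first source pointer nulled, a later ggml_build_forward stops traversing past it, so its buffer is treated as constant input data instead of being scheduled for recomputation. A sketch of the intended usage (w and x are illustrative names):

    // Compute a node once ...
    ggml_tensor* y = ggml_mul_mat(ctx, w, x);
    ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads=*/1);
    // ... then detach it, so graphs built later reuse y's data as-is
    // instead of re-running the matmul.
    ggml_detach(y);
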
 int _finalize_hypothesis(
     const SequenceGeneratorJob& job,
     ggml_context* ctx,
@@ -667,9 +674,6 @@ int _finalize_hypothesis(
     // Detect beams that reached the minimum length and that end with an EOS.
     bool eos = token == job.eos_idx;
     eos &= tok_score != -INFINITY;
-    // TODO ignored_beam_mask ?
-    // eos &= ggml_get_i32_1d(ignored_beam_mask, beam);
-    // ggml_set_i32_1d(eos_mask, beam, eos);

     if (!eos) return 0;

@@ -696,18 +700,20 @@ int _finalize_hypothesis(
         // Skip first EOS since it is always 0 and skews normalization.
         tok_score /= (float)std::pow((step_nr + 1), job.opts.len_penalty);

+    // TODO the score computed here isn't the same than computed by fairseq2.
     hypotheses.emplace_back(Hypothesis{tokens, tok_score, step_scores});
     return 1;
 }

 /// Generates a translation for a single sequence
 // TODO: finish this for beam_size=1
-// * implement the lprobs tweaking
+// * find out why score is different (seq is the same though)
 // TODO: add IncrementalStateBag support to avoid a O(N^3) generation.
 // TODO: support beam_size > 1:
 // * most layers assume un-batched input, but we want to handle several beams at once
 // * need to port "reorder_state_dict"
-// * once beam are selected with topk, we need to update seqs and scores tensors
+// TODO: clean up
+// * replace manual tensor tweaking with ggml_set_*d (ggml_set_slice could be useful)
 extern "C" float generate_sequence(
 extern "C" float generate_sequence(
     fairseq2_model& model,
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
     const SequenceGeneratorJob& job,
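
Regarding the length normalisation in _finalize_hypothesis above: dividing the cumulative score by (step_nr + 1)^len_penalty reduces, for len_penalty = 1.0 (the value the test below uses), to the average log-probability per generated token; the first EOS is skipped beforehand because its score of 0 would skew that average. As a standalone sketch:

    #include <cmath>

    // Length-penalised hypothesis score, mirroring the hunk above.
    float normalized_score(float cumulative, int step_nr, float len_penalty) {
        return cumulative / std::pow((float)(step_nr + 1), len_penalty);
    }
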
@@ -805,6 +811,7 @@ extern "C" float generate_sequence(
         ggml_cgraph gf = ggml_build_forward(lprobs);
         printf("beam search step %d. Graph.n_nodes: %d\n", step_nr, gf.n_nodes);
         ggml_graph_compute_with_ctx(ctx, &gf, 1);
+        ggml_detach(lprobs);

         // // Do not allow EOS before reaching the minimum sequence length.
         if (step_nr < job.opts.min_seq_len) {
@@ -814,7 +821,7 @@ extern "C" float generate_sequence(
         }

         // If we have reached the maximum length, force the last step to be EOS.
-        // TODO: should this be done in an hadoc loop ? how often does that happen anyway ?
+        // TODO: should this be done in an adhoc loop ? how often does that happen anyway ?
         if (step_nr == max_seq_len - 2) {
             // lprobs[:, :, : self.eos_idx]       = -torch.inf
             // lprobs[:, :,   self.eos_idx + 1 :] = -torch.inf
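
The loop body above settles into a fixed idiom: build a one-step graph over lprobs, run it, then immediately detach the result so that the raw writes further down (ggml_set_f32_1d, the UNK-penalty loop) and the graphs built in later iterations see plain data. In sketch form, with names taken from the diff:

    ggml_cgraph gf = ggml_build_forward(lprobs);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads=*/1);
    ggml_detach(lprobs);  // now safe to poke values directly, e.g.:
    ggml_set_f32_1d(lprobs, /*index=*/0, -INFINITY);
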
@@ -832,9 +839,14 @@ extern "C" float generate_sequence(
         for (size_t i = 0; i < beam_size; ++i)
             ggml_set_f32_1d(lprobs, vocab_size * i + pad_idx, -INFINITY);

-        // // Apply UNK penalty.
-        // if self.unk_idx is not None:
-        //     lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty
+        // Apply UNK penalty.
+        if (job.unk_idx >= 0 && job.opts.unk_penalty != 0) {
+            // lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty
+            auto lprobs_raw = ggml_get_data_f32(lprobs);
+            for (size_t i = 0; i < beam_size; ++i)
+                lprobs_raw[vocab_size * i + job.unk_idx] -= job.opts.unk_penalty;
+        }
+

         // Determine candidates for the next step.
         // (N, 2 x B)
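
The new UNK-penalty block edits the computed buffer directly through ggml_get_data_f32, which is valid here precisely because lprobs was detached after the graph ran. The same pattern as a hypothetical helper (not in the diff):

    // Subtract a penalty from one vocabulary entry in every beam by writing
    // straight into the computed buffer. Only valid after the graph has run
    // and the tensor has been detached.
    void penalize_token(ggml_tensor* lprobs, int64_t vocab_size,
                        int64_t beam_size, int32_t token, float penalty) {
        float* data = ggml_get_data_f32(lprobs);
        for (int64_t i = 0; i < beam_size; ++i)
            data[vocab_size * i + token] -= penalty;
    }
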
@@ -871,24 +883,24 @@ extern "C" float generate_sequence(
         // Reorder beams in the `seq` and `score` buffers. The same beam can
         // be selected more than once.
         ggml_tensor* new_seqs = seqs;
+        // ggml_get_rows and ggml_set only work with floats ...
+        new_seqs->type = GGML_TYPE_F32;
         ggml_tensor* new_scores = scores;
         if (step_nr > start_step) {
             // (B, S), (B) -> (B, S)
-            // ggml_get_rows only work with floats ...
-            new_seqs->type = GGML_TYPE_F32;
             new_seqs = ggml_get_rows(ctx, seqs, beam_indices);
             new_scores = ggml_get_rows(ctx, new_scores, beam_indices);
         }

         // new_seqs[:, step_nr + 1] = next_tokens
-        ggml_set_1d_inplace(ctx, new_seqs, next_tokens, new_seqs->nb[0] * (step_nr + 1));
-        ggml_set_1d_inplace(ctx, new_scores, next_scores, new_scores->nb[0] * (step_nr + 1));
-
-        gf = ggml_build_forward(new_seqs);
+        gf = ggml_build_forward(ggml_set_1d_inplace(ctx, new_seqs, next_tokens, new_seqs->nb[0] * (step_nr + 1)));
         ggml_graph_compute_with_ctx(ctx, &gf, 1);
+        ggml_detach(new_seqs);
         new_seqs->type = GGML_TYPE_I32;
-        gf = ggml_build_forward(new_scores);
+
+        gf = ggml_build_forward(ggml_set_1d_inplace(ctx, new_scores, next_scores, new_scores->nb[0] * (step_nr + 1)));
         ggml_graph_compute_with_ctx(ctx, &gf, 1);
+        ggml_detach(new_scores);

         // TODO the old seqs and score buffers could be reused for next step
         seqs = new_seqs;
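
One subtlety in this hunk: this ggml version's ggml_get_rows and ggml_set_1d kernels only handle F32 data, so the I32 seqs tensor is temporarily relabelled as F32. That is sound only because both element types are 4 bytes wide and these operations copy values rather than doing arithmetic on them, and the type tag must stay F32 until after ggml_graph_compute_with_ctx returns, since the kernel dispatches on it at compute time. The pattern in isolation:

    // Reuse an F32-only kernel for I32 data by flipping the type tag
    // around the computation (both element types are 4 bytes).
    seqs->type = GGML_TYPE_F32;                 // relabel before building
    ggml_tensor* out = ggml_get_rows(ctx, seqs, beam_indices);
    ggml_cgraph gf = ggml_build_forward(out);
    ggml_graph_compute_with_ctx(ctx, &gf, 1);   // runs as a plain F32 copy
    out->type = GGML_TYPE_I32;                  // restore the real type
    seqs->type = GGML_TYPE_I32;
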

+ 3 - 1
ggml/examples/unity/fairseq2.h

@@ -118,8 +118,10 @@ struct SequenceGeneratorOptions {
 struct SequenceGeneratorJob {
     SequenceGeneratorOptions opts;
     ggml_tensor* prefix_seq;
-    std::int32_t eos_idx;
     std::int32_t pad_idx;
+    std::int32_t unk_idx;
+    std::int32_t bos_idx;
+    std::int32_t eos_idx;
 };



+ 3 - 1
ggml/ggml.py

@@ -318,8 +318,10 @@ class SequenceGeneratorOptions:
 class SequenceGeneratorJob:
     opts: SequenceGeneratorOptions
     prefix_seq: Ptr[ggml_tensor]
-    eos_idx: int
     pad_idx: int
+    unk_idx: int
+    bos_idx: int
+    eos_idx: int


 @c_fn(lib)
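
The header change above and this Python change must always move together: ctypes maps structure fields by offset, not by name, so reordering eos_idx in fairseq2.h without mirroring it in ggml.py would silently hand the wrong index to every field after it. A hypothetical compile-time guard on the C++ side (not in the diff):

    #include <cstddef>  // offsetof

    // Pin the field order that ggml.py's SequenceGeneratorJob mirrors.
    static_assert(
        offsetof(SequenceGeneratorJob, pad_idx) < offsetof(SequenceGeneratorJob, unk_idx) &&
        offsetof(SequenceGeneratorJob, unk_idx) < offsetof(SequenceGeneratorJob, bos_idx) &&
        offsetof(SequenceGeneratorJob, bos_idx) < offsetof(SequenceGeneratorJob, eos_idx),
        "keep SequenceGeneratorJob field order in sync with ggml.py");
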

+ 3 - 1
ggml/test_unity_cpp.py

@@ -686,8 +686,10 @@ def test_t2tt(ctx: Ctx, g_model: c_void_p):
     job.opts.len_penalty = 1.0
     job.opts.unk_penalty = 0.0
     job.prefix_seq = ggml.from_numpy(ctx, text_out["tgt_tokens"].astype(np.int32)[:2])
+    job.pad_idx = 0
+    job.unk_idx = 1
+    job.bos_idx = 2
     job.eos_idx = 3
-    job.pad_idx = 1

     result = ggml.ggml_tensor()
     g_score = ggml.generate_sequence(
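
For reference, the equivalent job set-up on the C++ side, using the same special-token indices the test assigns (pad=0, unk=1, bos=2, eos=3; these values come from this test's vocabulary, not from a universal default):

    // Fill in the reordered special-token fields to match test_t2tt above.
    SequenceGeneratorJob job = {};
    job.pad_idx = 0;
    job.unk_idx = 1;
    job.bos_idx = 2;
    job.eos_idx = 3;
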