
generate_sequence: return full results

Guillaume Wenzek 1 year ago
parent
commit 6fbb465f2b

+ 9 - 4
ggml/ctypes_utils.py

@@ -12,12 +12,13 @@ class Ptr(Generic[T]):
     contents: T
 
     def __new__(cls):
+        breakpoint()
         return ctypes.pointer()
 
 
 def c_struct(cls):
     struct = types.new_class(cls.__name__, bases=(ctypes.Structure,))
-    struct.__module__ = "ctypes"
+    struct.__module__ = cls.__module__
     struct._fields_ = [
         (k, _py_type_to_ctype(v)) for k, v in cls.__annotations__.items()
     ]
@@ -33,8 +34,11 @@ def _py_type_to_ctype(t: type):
         )
     if t.__module__ == "ctypes":
         return t
-    if isinstance(t, type) and issubclass(t, ctypes.Structure):
-        return t
+    if isinstance(t, type):
+        if issubclass(t, ctypes.Structure):
+            return t
+        if issubclass(t, ctypes._Pointer):
+            return t
     if t is int:
         return ctypes.c_int
     if t is float:
@@ -66,7 +70,8 @@ def _c_fn(module, fn):
 
     @functools.wraps(fn)
     def actual_fn(*args, **kwargs):
-        return c_fn(*args, **kwargs)
+        raw_res = c_fn(*args, **kwargs)
+        return raw_res
 
     return actual_fn
 

+ 38 - 56
ggml/examples/unity/fairseq2.cpp

@@ -617,7 +617,8 @@ void _bootstrap_seqs_and_scores(
         seqs,
         /*padding_mask*/ nullptr,
         encoder_output,
-        /*encoder_padding_mask*/ nullptr // TODO: do we need padding for encoder ?
+        // We assume there is only one input, so we don't need encoder padding.
+        /*encoder_padding_mask*/ nullptr
         // TODO: state_bag
     );
     // TODO state_bag.increment_step(prefix_seq_len - 1)
@@ -645,22 +646,8 @@ void _bootstrap_seqs_and_scores(
     }
 }
 
-/// Represents a hypothesis produced by a sequence generator.
-struct Hypothesis {
-    /// The generated sequence.
-    ggml_tensor* seq;
-
-    /// The score of the hypothesis.
-    float score;
-
-    /// The score of each individual sequence step.
-    ggml_tensor* step_scores;
-};
-
-
-/// Finds the topk indices
+/// Finds the topk indices and writes the winning indices into the "candidate_indices" array.
 int topk(
-    ggml_context* ctx,
     ggml_tensor* lprobs,  // (B, V)
     std::int64_t k,
     ggml_tensor* candidate_indices
@@ -687,6 +674,7 @@ void ggml_detach(ggml_tensor* a) {
 }
 
 
+/// Copies the sequence and scores of a given candidate beam.
 void _finalize_hypothesis(
     const SequenceGeneratorJob& job,
     ggml_context* ctx,
@@ -698,7 +686,6 @@ void _finalize_hypothesis(
     ggml_tensor* scores, // (beam_size, seq_len)
     std::vector<Hypothesis>& hypotheses
 ) {
-    // If the candidate beam is "finished", let's copy the score and sequence
     ggml_tensor* tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
     ggml_tensor* step_scores = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, step_nr + 2);
 
@@ -711,12 +698,12 @@ void _finalize_hypothesis(
     // Convert from cumulative to per-step scores.
     auto sc = (float*)step_scores->data;
     float last_score = eos_score;
-    sc[step_nr + 1] = last_score;
     for (int i = step_nr; i >= 0; --i) {
         float sc0 = ggml_get_f32_1d(scores, scores->ne[0] * beam + i);
-        sc[i] = last_score - sc0;
+        sc[i + 1] = last_score - sc0;
         last_score = sc0;
     }
+    sc[0] = 0;
 
     if (job.opts.normalize_scores)
         // Skip first EOS since it is always 0 and skews normalization.
@@ -725,21 +712,21 @@ void _finalize_hypothesis(
     hypotheses.emplace_back(Hypothesis{tokens, eos_score, step_scores});
 }
 
+// Uses a ggml_context to allocate storage for arbitrary objects.
+#define GGML_CTX_ALLOC(ctx, Type, n) \
+    (Type*)(ggml_new_tensor_1d(ctx, GGML_TYPE_I8, sizeof(Type) * n)->data);
+
+
 /// Generates a translation for a single sequence
-// TODO: finish this for beam_size=1
-// * find out why score is different (seq is the same though)
 // TODO: add IncrementalStateBag support to avoid a O(N^3) generation.
-// TODO: support beam_size > 1:
-// * most layers assume un-batched input, but we want to handle several beams at once
-// * need to port "reorder_state_dict"
-// TODO: clean up
-// * replace manual tensor tweaking with ggml_set_*d (ggml_set_slice could be useful)
-extern "C" float generate_sequence(
+// TODO: clean up
+// * replace manual tensor tweaking with ggml_set_*d (a ggml_set_slice could be useful)
+extern "C" Hypothesis* generate_sequence(
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
-    ggml_tensor* output_seq
+    ggml_context* result_ctx
 ) {
     ggml_context* ctx = model.ctx;
     size_t eos_idx = job.eos_idx;
@@ -787,25 +774,6 @@ extern "C" float generate_sequence(
     // there should be a per-step ggml_context for intermediary results
     // start of beam search:
     for (int step_nr = start_step; step_nr < max_seq_len - 1; ++step_nr) {
-        // if (beam_indices != nullptr) {
-        //     // If not `None`, it means in the last step we finalized one or
-        //     // more searches. We should ensure that we adjust `beam_indices`
-        //     // before reordering `decoder`'s incremental state.
-        //     if (search_indices != nullptr) {
-        //         num_searches = search_indices->ne[0];
-
-        //         // (N)
-        //         delta = search_indices - torch.arange(num_searches, device=device)
-
-        //         // (N) -> (N, 1)
-        //         delta.unsqueeze_(-1)
-
-        //         // Adjust indices to take into account removed searches.
-        //         beam_indices.view(num_searches, beam_size).add_(delta * beam_size)
-        //     }
-
-        //     // state_bag.reorder(beam_indices)
-        // }
         // because of no IncrementalStateBag we pass input from the start
         // decoder_input = seqs[:, 0 : step_nr + 1]
         ggml_tensor* decoder_input = ggml_slice(ctx, seqs, 0, 0, step_nr + 1);
@@ -845,7 +813,6 @@ extern "C" float generate_sequence(
         }
 
         // If we have reached the maximum length, force the last step to be EOS.
-        // TODO: should this be done in an adhoc loop ? how often does that happen anyway ?
         if (step_nr == max_seq_len - 2) {
             // lprobs[:, :, : self.eos_idx]       = -torch.inf
             // lprobs[:, :,   self.eos_idx + 1 :] = -torch.inf
@@ -856,7 +823,6 @@ extern "C" float generate_sequence(
                 for (t = eos_idx + 1; t < vocab_size; ++t)
                     ggml_set_f32_1d(lprobs, vocab_size * b + t, -INFINITY);
             }
-
         }
 
         // Never allow PAD.
@@ -890,10 +856,10 @@ extern "C" float generate_sequence(
         gf = ggml_build_forward(lprobs);
         ggml_graph_compute_with_ctx(ctx, &gf, 1);
 
-        // Determine candidates for the next step.
+        // Determine (beam, token) candidates for the next step.
         // (N, 2 x B)
         std::int64_t K = topk(
-            ctx, lprobs, std::min(2 * beam_size, vocab_size - 1), candidate_indices
+            lprobs, std::min(2 * beam_size, vocab_size - 1), candidate_indices
         );
 
         std::size_t ongoing_beams = 0;
@@ -907,7 +873,7 @@ extern "C" float generate_sequence(
             bool eos = token == job.eos_idx;
             eos &= tok_score != -INFINITY;
             if (eos) {
-                _finalize_hypothesis(job, ctx, step_nr, beam, token, tok_score, seqs, scores, finished_searches);
+                _finalize_hypothesis(job, result_ctx, step_nr, beam, token, tok_score, seqs, scores, finished_searches);
                 if (finished_searches.size() >= beam_size)
                     goto end_of_beam_search;
                 continue;
@@ -960,9 +926,25 @@ end_of_beam_search:
         [](Hypothesis a, Hypothesis b) { return a.score > b.score; }
     );
 
-    // For now just return the best sequence
-    // TODO: return structured output
-    *output_seq = *(finished_searches[0].seq);
+    // Copy the finished hypotheses into an array allocated in result_ctx.
+    GGML_ASSERT(finished_searches.size() <= beam_size);
+    Hypothesis* result = GGML_CTX_ALLOC(result_ctx, struct Hypothesis, beam_size);
+    std::copy(finished_searches.begin(), finished_searches.end(), result);
+    // In case we have fewer searches than expected, still make sure to initialize the memory.
+    for (std::size_t i = finished_searches.size(); i < beam_size; ++i)
+        result[i] = Hypothesis{nullptr, -INFINITY, nullptr};
+
+    return result;
+}
+
+extern "C" Hypothesis* _testing_return_hypothesis_ptr(ggml_context* ctx) {
+    Hypothesis* result = GGML_CTX_ALLOC(ctx, struct Hypothesis, 2);
+
+    result[0] = {ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1), 3.14f, (ggml_tensor*)result};
+    ggml_set_i32_1d(result[0].seq, 0, 314);
 
-    return finished_searches[0].score;
+    result[1] = {ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1), 4.21f, nullptr};
+    ggml_set_i32_1d(result[1].seq, 0, 421);
+
+    return result;
 }
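
As a sanity check of the re-indexed per-step scores in `_finalize_hypothesis`: the loop walks the cumulative beam scores backwards, writes each per-step delta one slot to the right of its cumulative index, and pins the first slot (the initial token) to 0. The same arithmetic in numpy, with made-up values rather than data from the repo:

    import numpy as np

    # Hypothetical cumulative scores of one beam after each generated token.
    cumulative = np.array([-0.5, -1.2, -2.0], dtype=np.float32)  # scores[beam, 0 .. step_nr]
    eos_score = -2.4  # cumulative score once EOS is appended

    cum = np.concatenate([cumulative, [eos_score]]).astype(np.float32)
    step_scores = np.empty(len(cum), dtype=np.float32)  # length step_nr + 2
    step_scores[0] = 0.0            # slot of the initial token is pinned to 0
    step_scores[1:] = np.diff(cum)  # per-step deltas, shifted one slot to the right

    # step_scores == [0.0, cum[1]-cum[0], cum[2]-cum[1], eos_score-cum[2]]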

+ 14 - 2
ggml/examples/unity/fairseq2.h

@@ -142,11 +142,23 @@ struct SequenceGeneratorJob {
     std::int32_t eos_idx;
 };
 
+/// Represents a hypothesis produced by a sequence generator.
+struct Hypothesis {
+    /// The generated sequence.
+    ggml_tensor* seq;
 
-extern "C" float generate_sequence(
+    /// The score of the hypothesis.
+    float score;
+
+    /// The score of each individual sequence step.
+    ggml_tensor* step_scores;
+};
+
+
+extern "C" Hypothesis* generate_sequence(
     fairseq2_model& model,
     const SequenceGeneratorJob& opts,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
-    ggml_tensor* output_seq
+    ggml_context* result_ctx
 );

+ 18 - 2
ggml/ggml.py

@@ -424,12 +424,28 @@ class SequenceGeneratorJob:
     eos_idx: int
 
 
+@c_struct
+class Hypothesis:
+    seq: Ptr[ggml_tensor]
+    """The generated sequence."""
+
+    score: float
+    """The score of the hypothesis."""
+
+    step_scores: Ptr[ggml_tensor]
+    """The score of each individual sequence step."""
+
+
 @c_fn(lib)
 def generate_sequence(
     model: ctypes.c_void_p,
     job: Ptr[SequenceGeneratorJob],
     encoder_output: Ptr[ggml_tensor],
     encoder_padding_mask: Ptr[ggml_tensor],
-    output_seq: Ptr[ggml_tensor],
-) -> float:
+    result_ctx: ggml_context_p,
+) -> Ptr[Hypothesis]:
     ...
+
+@c_fn(lib)
+def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
+    return Ptr()
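
Since `generate_sequence` now returns a pointer to `beam_size` `Hypothesis` records allocated in `result_ctx` (with unfinished slots filled as `{nullptr, -INFINITY, nullptr}`), a Python caller indexes into the pointer the same way the tests do. A possible consumption helper, hypothetical and not part of this change:

    import ggml

    def read_hypotheses(result_ptr, beam_size: int):
        """Collect the finished hypotheses returned by ggml.generate_sequence."""
        hyps = []
        for i in range(beam_size):
            h = result_ptr[i]
            if not h.seq:  # NULL seq marks a slot for a search that never finished
                continue
            hyps.append({
                "seq": ggml.to_numpy(h.seq).tolist(),
                "score": h.score,
                "step_scores": ggml.to_numpy(h.step_scores),
            })
        return hyps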

+ 13 - 2
ggml/test_ggml_integration.py

@@ -23,12 +23,12 @@ from seamless_communication.models.inference.translator import Translator, Modal
 Ctx = ggml.ggml_context_p
 
 UNITY_MODELS = Path(__file__).parent / "examples/unity/models"
-CTX_PARAMS = ggml.ggml_init_params(mem_size=1024 * 1024 * 1024, mem_buffer=None)
+CTX_PARAMS = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None)
 
 
 @pytest.fixture(name="ctx")
 def _ctx() -> Iterator[Ctx]:
-    """Allocate a new context with 1024 MB of memory"""
+    """Allocate a new context with 16 MB of memory"""
     try:
         ctx = ggml.ggml_init(params=CTX_PARAMS)
         yield ctx
@@ -353,3 +353,14 @@ def test_ggml_softmax_vs_torch(ctx: Ctx, shape: Tuple[int, ...]) -> None:
     y = ggml.to_numpy(gy)
     assert np.allclose(y_exp, y, rtol=1e-3)
     assert np.allclose(np.argmax(y_exp, axis=-1), np.argmax(y, axis=-1))
+
+
+def test_can_return_hypothesis_ptr(ctx: Ctx) -> None:
+    hyp_ptr = ggml._testing_return_hypothesis_ptr(ctx)
+
+    hyp0, hyp1 = hyp_ptr[0], hyp_ptr[1]
+    assert ggml.to_numpy(hyp0.seq).tolist() == [314]
+    assert hyp0.score == pytest.approx(3.14)
+
+    assert ggml.to_numpy(hyp1.seq).tolist() == [421]
+    assert hyp1.score == pytest.approx(4.21)

+ 91 - 70
ggml/test_unity_cpp.py

@@ -8,6 +8,7 @@ import fairseq2.nn
 import fairseq2.nn.transformer
 import logging
 import sys
+import functools
 from pathlib import Path
 from ctypes_utils import Ptr
 from ctypes import c_void_p
@@ -32,40 +33,34 @@ def _ctx() -> Iterator[Ctx]:
     """Allocate a new context with 1024 MB of memory"""
     try:
         ctx = ggml.ggml_init(params=CTX_PARAMS)
-        yield ctx
+        with torch.inference_mode():
+            yield ctx
     finally:
         ggml.ggml_free(ctx)
 
 
-@pytest.fixture(scope="module")
-def g_model_once() -> Iterator[c_void_p]:
+@functools.lru_cache()
+def _load_g_model_once() -> NativeObj:
     model_file = Path(__file__).parent / "seamlessM4T_medium.ggml"
     if not model_file.exists():
         convert_model("seamlessM4T_medium", model_file)
-    with ggml.load_unity_ggml_file(model_file) as model:
-        yield model
-
+    return ggml.load_unity_ggml_file(model_file)
 
 @pytest.fixture()
-def g_model(ctx: Ctx, g_model_once: c_void_p) -> c_void_p:
-    ggml.lib.fairseq2_model_set_inference_ctx(g_model_once, ctx)
-    return g_model_once
+def g_model(ctx: Ctx) -> c_void_p:
+    model = _load_g_model_once()
+    ggml.lib.fairseq2_model_set_inference_ctx(model.ptr, ctx)
+    return model.ptr
 
 
-@pytest.fixture(scope="module")
-def translator() -> Iterator[Any]:
-    tr = Translator(
+@functools.lru_cache(maxsize=1)
+def load_translator() -> Translator:
+    return Translator(
         "seamlessM4T_medium", "vocoder_36langs", torch.device("cpu"), torch.float32
     )
-    with torch.inference_mode():
-        yield tr
-
 
-@pytest.fixture(scope="module")
-def pt_model(translator: Translator) -> Any:
-    model = translator.model
-    print(model)
-    return model
+def load_pt_model() -> Any:
+    return load_translator().model
 
 
 @pytest.mark.xfail(reason="TODO")
@@ -108,10 +103,11 @@ def test_causal_attention_mask(ctx: Ctx):
     assert np.all(mask == mask_exp)
 
 
-def test_LayerNorm_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any) -> None:
+def test_LayerNorm_forward(ctx: Ctx, g_model: c_void_p) -> None:
     x = torch.empty((2, 21, 1024))
     torch.nn.init.uniform_(x, -1, 1)
 
+    pt_model = load_pt_model()
     y_exp = pt_model.text_encoder.layers[0].ffn_layer_norm(x).numpy()
     gx = ggml.from_numpy(ctx, x)
     gy = ggml.forward("LayerNorm", g_model, "text_encoder.layers.0.ffn_layer_norm", gx)
@@ -121,10 +117,11 @@ def test_LayerNorm_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any) -> None:
     assert np.allclose(y_exp, y, atol=1e-5)
 
 
-def test_Linear_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any) -> None:
+def test_Linear_forward(ctx: Ctx, g_model: c_void_p) -> None:
     x = torch.empty((2, 21, 1024))
     torch.nn.init.uniform_(x, -1, 1)
 
+    pt_model = load_pt_model()
     y_exp = pt_model.text_encoder.layers[0].ffn.inner_proj(x).numpy()
     gx = ggml.from_numpy(ctx, x)
     gy = ggml.forward("Linear", g_model, "text_encoder.layers.0.ffn.inner_proj", gx)
@@ -134,11 +131,12 @@ def test_Linear_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any) -> None:
     assert np.allclose(y_exp, y, atol=1e-5)
 
 
-def test_FeedForwardNetwork_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any) -> None:
+def test_FeedForwardNetwork_forward(ctx: Ctx, g_model: c_void_p) -> None:
     x = torch.empty((2, 21, 1024))  # (bs, seq_len, model_dim)
     torch.nn.init.uniform_(x, -1 / 32, 1 / 32)
 
     # Test FFN without LayerNorm
+    pt_model = load_pt_model()
     y_exp = pt_model.text_encoder.layers[0].ffn(x).numpy()
     gx = ggml.from_numpy(ctx, x)
     gy = ggml.forward(
@@ -157,11 +155,12 @@ def _name(tensor: ggml.ggml_tensor_p) -> bytes:
         return b"???"
 
 
-def test_MultiheadAttention_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any) -> None:
+def test_MultiheadAttention_forward(ctx: Ctx, g_model: c_void_p) -> None:
     x = torch.empty((2, 21, 1024))
     torch.random.manual_seed(0)
     torch.nn.init.uniform_(x, -1, 1)
 
+    pt_model = load_pt_model()
     self_attn = pt_model.text_encoder.layers[0].self_attn
 
     # Note: we use different lengths for queries and keys,
@@ -222,13 +221,14 @@ def test_MultiheadAttention_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any)
 
 
 def test_StandardTransformerEncoderLayer_forward(
-    ctx: Ctx, g_model: c_void_p, pt_model: Any
+    ctx: Ctx, g_model: c_void_p
 ) -> None:
     x = torch.empty((2, 21, 1024))
     padding_mask = torch.ones((2, 21))
     torch.random.manual_seed(0)
     torch.nn.init.uniform_(x, -1, 1)
 
+    pt_model = load_pt_model()
     layer = pt_model.text_encoder.layers[0]
 
     gx = ggml.from_numpy(ctx, x)
@@ -255,7 +255,7 @@ def test_StandardTransformerEncoderLayer_forward(
 
 
 def test_StandardTransformerEncoder_forward(
-    ctx: Ctx, g_model: c_void_p, pt_model: Any
+    ctx: Ctx, g_model: c_void_p
 ) -> None:
     x = torch.empty((2, 21, 1024))
     padding_mask = torch.ones((2, 21))
@@ -278,6 +278,7 @@ def test_StandardTransformerEncoder_forward(
 
     y = ggml.to_numpy(gy)
 
+    pt_model = load_pt_model()
     y_exp, _ = pt_model.text_encoder(x, padding_mask)
     y_exp = y_exp.numpy()
 
@@ -306,7 +307,7 @@ def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
 
 
 def test_TransformerEmbeddingFrontend_forward(
-    ctx: Ctx, g_model: c_void_p, pt_model: Any
+    ctx: Ctx, g_model: c_void_p
 ) -> None:
     seq = torch.arange(2 * 20).reshape(2, 20)
     seq[1, 15:] = 0  # padding for second sentence
@@ -320,6 +321,7 @@ def test_TransformerEmbeddingFrontend_forward(
     ggml.build_and_compute(ctx, gy)
     y = ggml.to_numpy(gy)
 
+    pt_model = load_pt_model()
     y_exp, _ = pt_model.text_decoder_frontend(seq, seq_len)
     y_exp = y_exp.numpy()
 
@@ -328,7 +330,7 @@ def test_TransformerEmbeddingFrontend_forward(
 
 
 def test_StandardTransformerDecoder_forward(
-    ctx: Ctx, g_model: c_void_p, pt_model: Any
+    ctx: Ctx, g_model: c_void_p
 ) -> None:
     x = torch.empty((2, 13, 1024))
     encoder_out = torch.empty((2, 21, 1024))
@@ -353,6 +355,7 @@ def test_StandardTransformerDecoder_forward(
     ggml.build_and_compute(ctx, gy)
     y = ggml.to_numpy(gy)
 
+    pt_model = load_pt_model()
     y_exp, _ = pt_model.text_decoder(x, padding_mask, encoder_out, None)
     y_exp = y_exp.numpy()
 
@@ -361,64 +364,82 @@ def test_StandardTransformerDecoder_forward(
 
 
 def test_t2tt(ctx: Ctx, g_model: c_void_p):
-    # def test_t2tt(ctx: Ctx, g_model: c_void_p, translator):
-    # device = translator.device
     src_lang = "eng"
     src_text = "We are all in a yellow submarine."
     tgt_lang = "fra"
-    # token_encoder = translator.text_tokenizer.create_encoder(
-    #     task="translation", lang=src_lang, mode="source", device=device
-    # )
-    # src = translator.collate(token_encoder(src_text))
-
-    # text_out, _ = translator.get_prediction(
-    #     translator.model,
-    #     translator.text_tokenizer,
-    #     translator.unit_tokenizer,
-    #     src,
-    #     input_modality=Modality.TEXT,
-    #     output_modality=Modality.TEXT,
-    #     tgt_lang=tgt_lang,
-    # )
-
-    # tgt_text = str(text_out.sentences[0])
-    # assert tgt_text == "Nous sommes tous dans un sous-marin jaune."
-    # tgt_tokens = text_out.generator_output.results[0][0].seq
-    # score = text_out.generator_output.results[0][0].score.item()
-    # np.savez(
-    #     Path(__file__).parent / "sample_input.npz",
-    #     score=score,
-    #     encoder_output=text_out.encoder_output.squeeze(0).numpy(),
-    #     encoder_padding_mask=text_out.encoder_padding_mask.squeeze(0).numpy(),
-    #     tgt_tokens=tgt_tokens.numpy(),
-    # )
-
-    text_out = np.load(Path(__file__).parent / "sample_input.npz")
-    score = text_out["score"].item()
-
-    tgt_tokens = list(text_out["tgt_tokens"])
+    sample_file = Path(__file__).parent / "sample_input.npz"
+    beam_size = 2
+
+    if not sample_file.exists():
+        translator = load_translator()
+        device = translator.device
+        token_encoder = translator.text_tokenizer.create_encoder(
+            task="translation", lang=src_lang, mode="source", device=device
+        )
+        src = translator.collate(token_encoder(src_text))
+
+        text_out, _ = translator.get_prediction(
+            translator.model,
+            translator.text_tokenizer,
+            translator.unit_tokenizer,
+            src,
+            input_modality=Modality.TEXT,
+            output_modality=Modality.TEXT,
+            tgt_lang=tgt_lang,
+            beam_size=beam_size,
+        )
+
+        tgt_text = str(text_out.sentences[0])
+        assert tgt_text == "Nous sommes tous dans un sous-marin jaune."
+        hypotheses = [
+            {
+                "seq": h.seq.tolist(),
+                "score": h.score.item(),
+                "step_scores": h.step_scores.numpy(),
+            }
+            for h in text_out.generator_output.results[0]
+        ]
+        np.savez(
+            sample_file,
+            encoder_output=text_out.encoder_output.numpy(),
+            encoder_padding_mask=text_out.encoder_padding_mask.numpy(),
+            hypotheses=hypotheses,
+        )
+
+    # allow_pickle is needed to load the hypothesis dicts
+    text_out = np.load(sample_file, allow_pickle=True)
     encoder_out = ggml.from_numpy(ctx, text_out["encoder_output"])
     encoder_padding_mask = ggml.from_numpy(ctx, text_out["encoder_padding_mask"])
+    prefix_seq = np.array(text_out["hypotheses"][0]["seq"][:2]).astype(np.int32)
+    max_seq_len = max(len(h["seq"]) for h in text_out["hypotheses"])
 
     job = ggml.SequenceGeneratorJob()
-    job.opts.beam_size = 2
+    job.opts.beam_size = beam_size
     job.opts.min_seq_len = 1
     job.opts.soft_max_seq_len_a = 1
     job.opts.soft_max_seq_len_b = 200
-    job.opts.hard_max_seq_len = int(len(tgt_tokens) * 1.5)
+    job.opts.hard_max_seq_len = int(max_seq_len * 1.5)
     job.opts.len_penalty = 1.0
     job.opts.unk_penalty = 0.0
     job.opts.normalize_scores = True
-    job.prefix_seq = ggml.from_numpy(ctx, text_out["tgt_tokens"].astype(np.int32)[:2])
+
+    job.prefix_seq = ggml.from_numpy(ctx, prefix_seq)
     job.pad_idx = 0
     job.unk_idx = 1
     job.bos_idx = 2
     job.eos_idx = 3
 
-    result = ggml.ggml_tensor()
-    g_score = ggml.generate_sequence(
-        g_model, job, encoder_out, encoder_padding_mask, ctypes.byref(result)
+    result_ptr = ggml.generate_sequence(
+        g_model, job, encoder_out, encoder_padding_mask, ctx
     )
-    tokens = list(ggml.to_numpy(ctypes.pointer(result)))
-    assert tokens == tgt_tokens
-    assert g_score == pytest.approx(score, rel=1e-2)
+    results = [result_ptr[i] for i in range(beam_size)]
+
+    assert len(results) == len(text_out["hypotheses"])
+    for g_hyp, exp in zip(results, text_out["hypotheses"]):
+        g_tokens = list(ggml.to_numpy(g_hyp.seq))
+        g_step_scores = ggml.to_numpy(g_hyp.step_scores)
+        assert g_tokens == exp["seq"]
+        assert g_hyp.score == pytest.approx(exp["score"], rel=1e-2)
+        # The score error is large; this may negatively impact the beam search.
+        assert np.allclose(g_step_scores, exp["step_scores"], atol=0.1)
+