
SinusoidalPositionEncoder + WIP: TransformerEmbeddingFrontend

Guillaume Wenzek 1 year ago
parent
commit
2238cea072
5 changed files with 346 additions and 30 deletions
  1. ggml/examples/unity/fairseq2.cpp (+92, -15)
  2. ggml/examples/unity/fairseq2.h (+7, -2)
  3. ggml/ggml.py (+40, -3)
  4. ggml/ggml_convert.py (+54, -3)
  5. ggml/test_unity_cpp.py (+153, -7)

+ 92 - 15
ggml/examples/unity/fairseq2.cpp

@@ -59,7 +59,8 @@ extern "C" ggml_tensor* Linear_forward(
 extern "C" ggml_tensor* LayerNorm_forward(
 extern "C" ggml_tensor* LayerNorm_forward(
     fairseq2_model& model,
     fairseq2_model& model,
     const std::string &prefix,
     const std::string &prefix,
-    ggml_tensor* input) {
+    ggml_tensor* input
+) {
     ggml_tensor* weight = model.tensors[prefix + ".weight"];
     ggml_tensor* weight = model.tensors[prefix + ".weight"];
     GGML_ASSERT(weight != nullptr);
     GGML_ASSERT(weight != nullptr);
     ggml_tensor* bias = model.tensors[prefix + ".bias"];
     ggml_tensor* bias = model.tensors[prefix + ".bias"];
@@ -222,6 +223,74 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
     return seqs;
 }

+struct ggml_tensor * ggml_slice(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int axis,
+        int64_t               start,
+        int64_t               end
+    ) {
+    int64_t ne[4];
+    std::copy(a->ne, a->ne + 4, ne);
+    if (start < 0) start = ne[axis] + start;
+    if (end < 0) end = ne[axis] + end;
+    GGML_ASSERT(0 <= start);
+    GGML_ASSERT(start <= end);
+    GGML_ASSERT(end <= ne[axis]);
+
+    ne[axis] = end - start;
+    size_t offset = a->nb[axis] * start;
+
+    size_t* nb = a->nb;
+    ggml_tensor* result = ggml_view_4d(ctx, a, ne[0], ne[1], ne[2], ne[3], nb[1], nb[2], nb[3], offset);
+    result->n_dims = a->n_dims;
+    return result;
+}
+
+
+extern "C" ggml_tensor* PositionalEmbedding_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* embeds
+) {
+    int encoding_dim = embeds->ne[0];
+    int seq_len = embeds->ne[1];
+    ggml_tensor* full_pos_embeds = model.tensors[prefix];
+    ggml_tensor* pos_embeds = ggml_slice(model.ctx, full_pos_embeds, /*axis*/1, 0, seq_len);
+    return ggml_add(model.ctx, embeds, pos_embeds);
+}
+
+extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+    // TODO: state_bag
+) {
+    ggml_context* ctx = model.ctx;
+    ggml_tensor* embed_weights = model.tensors[prefix + ".embed.weight"];
+    GGML_ASSERT(embed_weights != nullptr);
+    ggml_tensor* embeds = ggml_get_rows(ctx, embed_weights, seqs);
+
+    // padding_mask = to_padding_mask(embeds, seq_lens)
+
+    // TODO: scale when saving the model weights
+    // embeds = ggml_scale embeds * self.scale
+
+    if (has_layer(model, prefix + ".pos_encoder")) {
+        // This only works with the simple pos encoders
+        int encoding_dim = embeds->ne[0];
+        int seq_len = embeds->ne[1];
+        ggml_tensor* pos_embeds = ggml_view_2d(ctx, model.tensors[prefix + ".pos_encoder"], encoding_dim, seq_len, 0, 0);
+        embeds = ggml_add(ctx, embeds, pos_embeds);
+    }
+
+    if (has_layer(model, prefix + ".layer_norm")) {
+        embeds = LayerNorm_forward(model, prefix + ".layer_norm", embeds);
+    }
+
+    // padding mask ?
+    return embeds;
+}

 extern "C" ggml_tensor* StandardTransformerEncoder_forward(
     fairseq2_model& model,
@@ -389,13 +458,13 @@ extern "C" ggml_tensor* StandardTransformerDecoder_forward(
 using IncrementalStateBag = std::unordered_map<ggml_tensor*, ggml_tensor*>*;


-int _determine_max_seq_len(const SequenceGeneratorJob& job) {
+int _determine_max_seq_len(const SequenceGeneratorJob& job, int source_seq_len) {
     auto opts = job.opts;
     int max_seq_len = -1;
-    if (job.source_seq_len <= 0 || opts.soft_max_seq_len_a <= 0) {
+    if (source_seq_len <= 0 || opts.soft_max_seq_len_a <= 0) {
         max_seq_len = opts.hard_max_seq_len;
     } else {
-        max_seq_len = std::min(opts.hard_max_seq_len, int(opts.soft_max_seq_len_a * job.source_seq_len + opts.soft_max_seq_len_b));
+        max_seq_len = std::min(opts.hard_max_seq_len, int(opts.soft_max_seq_len_a * source_seq_len + opts.soft_max_seq_len_b));
     }

     if (opts.min_seq_len > max_seq_len) {
@@ -432,11 +501,12 @@ void _fan_out_encoder_output(

     // (B, S_enc, M)
     ggml_tensor* shape = ggml_new_tensor_3d(ctx, GGML_TYPE_I8, encoder_output->ne[0], encoder_output->ne[1], beam_size);
-
     // (S_enc, M) -> (B, S_enc, M)
     *encoder_output_out = ggml_repeat(ctx, encoder_output, shape);
+    // (S_enc) -> (B, S_enc)
+    ggml_tensor* shape_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_I8, encoder_padding_mask->ne[0], beam_size);
     if (encoder_padding_mask != nullptr) {
-        *encoder_padding_mask_out = ggml_repeat(ctx, encoder_padding_mask, shape);
+        *encoder_padding_mask_out = ggml_repeat(ctx, encoder_padding_mask, shape_mask);
     }
 }
 
 
@@ -464,7 +534,7 @@ void _bootstrap_seqs_and_scores(
     ggml_context* ctx = model.ctx;

     // seqs[:, : prefix_seq_len] = job.prefix_seq;
-    ggml_cpy(ctx, job.prefix_seq, ggml_view_2d(ctx, seqs, 0, prefix_seq_len, 0, 0));
+    ggml_cpy(ctx, job.prefix_seq, ggml_view_2d(ctx, seqs, 0, prefix_seq_len, seqs->nb[1], 0));

     // We have to bootstrap the model with the already fanned-out encoder
     // output to correctly initialize its incremental state. This causes some
@@ -477,7 +547,7 @@ void _bootstrap_seqs_and_scores(
     // Bootstrap the model state with prefix sequence.
     ggml_tensor* decoder_output = StandardTransformerDecoder_forward(
         model,
-        ".decoder",
+        "text_decoder",
         seqs,
         /*padding_mask*/ nullptr,
         encoder_output,
@@ -487,7 +557,7 @@ void _bootstrap_seqs_and_scores(
     // TODO state_bag.increment_step(prefix_seq_len - 1)

     // logits, lprobs: (N, S_pfx - 1, V)
-    ggml_tensor* logits = Linear_forward(model, ".decoder.final_proj", decoder_output);
+    ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output);
     ggml_tensor* lprobs = ggml_log_softmax(ctx, ggml_view_3d(ctx, logits, logits->ne[0], logits->ne[1], 1, 0, 0, 0));
     int vocab_size = logits->ne[0];
 
 
@@ -622,23 +692,29 @@ bool _finalize_hypothesis(
 }

 /// Generates a translation for a single sequence
+// TODO: finish this for beam_size=1
+// * implement the lprobs tweaking
+// TODO: add IncrementalStateBag support to avoid O(N^3) generation.
+// TODO: support beam_size > 1:
+// * most layers assume un-batched input, but we want to handle several beams at once
+// * need to port "reorder_state_dict"
+// * once beams are selected with topk, we need to update seqs and scores tensors
 extern "C" float generate_sequence(
 extern "C" float generate_sequence(
     fairseq2_model& model,
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
     const SequenceGeneratorJob& job,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
     ggml_tensor* encoder_padding_mask,
-    ggml_tensor** output_seq
+    ggml_tensor* output_seq
 ) {
 ) {
-    int input_seq_len = encoder_output->ne[1];
     int vocab_size = encoder_output->ne[0];
     int vocab_size = encoder_output->ne[0];
     int beam_size = job.opts.beam_size;
     int beam_size = job.opts.beam_size;
-    int max_seq_len = _determine_max_seq_len(job);
+    int source_seq_len = encoder_output->ne[1];
+    int max_seq_len = _determine_max_seq_len(job, source_seq_len);
     ggml_context* ctx = model.ctx;
     ggml_context* ctx = model.ctx;
 
 
     // (S_enc, M) -> (B, S_enc, M)
     // (S_enc, M) -> (B, S_enc, M)
     _fan_out_encoder_output(ctx, &encoder_output, &encoder_padding_mask, beam_size);
     _fan_out_encoder_output(ctx, &encoder_output, &encoder_padding_mask, beam_size);
 
 
-    std::vector<Hypothesis> active_searches(beam_size);
     std::vector<Hypothesis> finished_searches(beam_size);
     std::vector<Hypothesis> finished_searches(beam_size);
 
 
     // Initialize buffers. (B, S)
     // Initialize buffers. (B, S)
@@ -688,9 +764,10 @@ extern "C" float generate_sequence(
         //     // state_bag.reorder(beam_indices)
         // }

+        seqs = TransformerEmbeddingFrontend_forward(model, "text_decoder_frontend", seqs);
         ggml_tensor* decoder_output = StandardTransformerDecoder_forward(
             model,
-            ".decoder",
+            "text_decoder",
             // seqs[:, step_nr : step_nr + 1]
             ggml_view_2d(ctx, seqs, 1, beam_size, step_nr * seqs->nb[0], 0),
             nullptr,  // We never generate PAD.
@@ -701,7 +778,7 @@ extern "C" float generate_sequence(
 
 
         // state_bag.increment_step()

-        ggml_tensor* logits = Linear_forward(model, ".decoder.final_proj", decoder_output);
+        ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output);
         ggml_tensor* lprobs = ggml_log_softmax(ctx, logits);

         // // Do not allow EOS before reaching the minimum sequence length.
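As a rough numpy sketch of the math that PositionalEmbedding_forward and TransformerEmbeddingFrontend_forward are meant to reproduce (illustrative only, not ggml code; the pre-scaled embedding weights and the stored ".pos_encoder" table follow this commit's converter changes, every other name is made up for the example):

    import numpy as np

    def embedding_frontend_sketch(seqs, embed_weights, pos_table):
        # seqs: (seq_len,) int32 token ids
        # embed_weights: (vocab_size, model_dim), already multiplied by the embedding scale at conversion time
        # pos_table: (max_seq_len, model_dim), precomputed sinusoidal table stored under ".pos_encoder"
        embeds = embed_weights[seqs]               # ggml_get_rows: one embedding row per token
        embeds = embeds + pos_table[:len(seqs)]    # ggml_slice on the sequence axis, then ggml_add
        # an optional layer norm follows when the frontend has a ".layer_norm"
        return embeds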

+ 7 - 2
ggml/examples/unity/fairseq2.h

@@ -55,6 +55,12 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     ggml_tensor* _ // (klen, slen)  TODO: do we need to pass mask here ?
 );

+extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+);
+
 extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
     fairseq2_model& model,
     const std::string& prefix,
@@ -105,7 +111,6 @@ struct SequenceGeneratorOptions {
 struct SequenceGeneratorJob {
     SequenceGeneratorOptions opts;
     ggml_tensor* prefix_seq;
-    int source_seq_len;
     std::int32_t eos_idx;
 };
 
 
@@ -115,5 +120,5 @@ extern "C" float generate_sequence(
     const SequenceGeneratorJob& opts,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
-    ggml_tensor** output_seq
+    ggml_tensor* output_seq
 );

+ 40 - 3
ggml/ggml.py

@@ -16,6 +16,7 @@ from typing import Union
 from typing import Type

 from third_party_ggml import *
+from ctypes_utils import c_struct, c_fn, Ptr

 ### Helpers
 
 
@@ -29,12 +30,17 @@ def numpy_dtype(ggml_type: ctypes.c_int) -> type:
         # GGML_TYPE_F16  = 1,
         return np.float16

+    if ggml_type == 18:
+        return np.int32
+
     raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")


 def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
     if dtype == np.float32:
         return ctypes.c_int(0)
+    elif dtype == np.int32:
+        return ctypes.c_int(18)
     elif dtype == np.float16:
         return ctypes.c_int(1)
     raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")
@@ -288,8 +294,39 @@ def forward(
     with CppStr(prefix) as std_prefix:
         return fwd(model, std_prefix, *inputs)  # ignore: type[no-any-return]

-lib.causal_attention_mask.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)]
-lib.causal_attention_mask.restype = ctypes.POINTER(ggml_tensor)

-def causal_attention_mask(ctx: ggml_context_p, seqs: ggml_tensor_p) -> ggml_tensor_p:
+@c_fn(lib)
+def causal_attention_mask(
+    ctx: ggml_context_p, seqs: Ptr[ggml_tensor]
+) -> Ptr[ggml_tensor]:
     return lib.causal_attention_mask(ctx, seqs)  # type: ignore[no-any-return]
+
+
+@c_struct
+class SequenceGeneratorOptions:
+    beam_size: int
+    min_seq_len: int
+    soft_max_seq_len_a: int
+    soft_max_seq_len_b: int
+    hard_max_seq_len: int
+    len_penalty: float
+    unk_penalty: float
+    normalize_scores: bool
+
+
+@c_struct
+class SequenceGeneratorJob:
+    opts: SequenceGeneratorOptions
+    prefix_seq: Ptr[ggml_tensor]
+    eos_idx: int
+
+
+@c_fn(lib)
+def generate_sequence(
+    model: ctypes.c_void_p,
+    job: Ptr[SequenceGeneratorJob],
+    encoder_output: Ptr[ggml_tensor],
+    encoder_padding_mask: Ptr[ggml_tensor],
+    output_seq: Ptr[ggml_tensor],
+) -> float:
+    ...
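ctypes_utils is not part of this diff; as a hedged sketch of what the @c_struct decorator is assumed to do (a hypothetical stand-in, the real helper may differ), it builds a ctypes.Structure from the class annotations:

    import ctypes

    _SCALARS = {int: ctypes.c_int, float: ctypes.c_float, bool: ctypes.c_bool}

    def c_struct(cls):
        # Hypothetical helper: turn annotated fields into ctypes.Structure _fields_.
        # Annotations that are already ctypes types (nested structs, pointers) pass through unchanged.
        fields = [(name, _SCALARS.get(hint, hint)) for name, hint in cls.__annotations__.items()]
        return type(cls.__name__, (ctypes.Structure,), {"_fields_": fields})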

+ 54 - 3
ggml/ggml_convert.py

@@ -11,10 +11,12 @@ from enum import Enum
 from io import BufferedWriter
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Tuple, Union
-
 import torch
 import ggml
+from typing import List
 from fairseq2.assets import AssetCard
+from fairseq2.models.transformer.frontend import TransformerEmbeddingFrontend
+from fairseq2.nn import SinusoidalPositionEncoder
 from seamless_communication.models.unity import load_unity_config, load_unity_model

 Preprocessor = Callable[[Any], Any]
@@ -33,13 +35,59 @@ def convert_model(model_name: str, out: Optional[Path] = None) -> None:
     else:
         raise ValueError(f"Unsupported model type: {model_name}")

+    state_dict = model.state_dict()
+    fixup_model(model, state_dict)
+
     with out.open("wb") as o:
-        write_ggml_file(o, hparams, model.state_dict())
+        write_ggml_file(o, hparams, state_dict)

     with out.with_suffix(".hparams.h").open("w") as h:
         h.write(generate_hparams_struct(hparams, "unity_hparams"))


+def _nested_getattr(model: Any, name: str) -> Any:
+    parts = name.split(".")
+    node = model
+    for part in parts:
+        node = getattr(node, part)
+        if node is None:
+            return None
+    return node
+
+
+def find_children(model: torch.nn.Module, t: type) -> List[Tuple[str, torch.nn.Module]]:
+    queue = list(model._modules.items())
+    modules = []
+    while queue:
+        name, node = queue.pop()
+        if node is None:
+            continue
+        if isinstance(node, t):
+            modules.append((name, node))
+        for child_name, child_node in node._modules.items():
+            queue.append((".".join((name, child_name)), child_node))
+
+    return modules
+
+
+def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) -> None:
+    # Bake the embedding scaling into the weights
+    frontends = find_children(model, TransformerEmbeddingFrontend)
+    print("Upgrading the following TransformerEmbeddingFrontend:", [x[0] for x in frontends])
+    for name, frontend in frontends:
+        embed_weights = state_dict[name + ".embed.weight"]
+        state_dict[name + ".embed.weight"] = embed_weights * frontend.scale
+
+    # Sinusoidal embeddings are typically not saved since they are easily recomputed,
+    # but this allows us to avoid porting the sinusoidal logic to GGML
+    pos_encoders = find_children(model, SinusoidalPositionEncoder)
+    print("Upgrading the following SinusoidalPositionEncoder:", [x[0] for x in pos_encoders])
+    for name, pos_encoder in pos_encoders:
+        assert isinstance(pos_encoder.weight, torch.Tensor)
+        assert name not in state_dict
+        state_dict[name] = pos_encoder.weight
+
+
 def write_ggml_file(
     out: BufferedWriter, hparams: Dict[str, Any], state_dict: Dict[str, torch.Tensor]
 ) -> None:
@@ -52,7 +100,9 @@ def write_ggml_file(
         # + tensor overhead
         byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
         hparams["model_byte_size"] = byte_size
-        logging.warning(f"Saving a ggml file with {len(state_dict)} tensors, for an estimated amount of {byte_size / (1024**3)} GGML Gb")
+        logging.warning(
+            f"Saving a ggml file with {len(state_dict)} tensors, for an estimated amount of {byte_size / (1024**3)} GGML Gb"
+        )
     # 6877961321223123048
     hparams["__end_of_hparams__"] = struct.unpack("l", b"hparams_")[0]
 
 
@@ -140,6 +190,7 @@ def write_tensor(out: BufferedWriter, value: torch.Tensor) -> None:

     data.tofile(out)

+
 def torch_to_ggml_type(dtype: type) -> int:
     if dtype is torch.float32:
         return ggml.GGML_TYPE_F32
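The sinusoidal table that fixup_model saves under the pos_encoder name is exactly the logic the C++ side no longer needs to port. For reference, the standard sinusoidal formulation looks roughly like the sketch below; fairseq2's SinusoidalPositionEncoder may differ in details (for example the legacy pad offset mentioned in the tests), so treat it as illustrative:

    import torch

    def sinusoidal_table(max_seq_len: int, model_dim: int) -> torch.Tensor:
        # table[p, 2i]   = sin(p / 10000^(2i / model_dim))
        # table[p, 2i+1] = cos(p / 10000^(2i / model_dim))
        pos = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, model_dim, 2, dtype=torch.float32) / model_dim))
        table = torch.zeros(max_seq_len, model_dim)
        table[:, 0::2] = torch.sin(pos * inv_freq)
        table[:, 1::2] = torch.cos(pos * inv_freq)
        return table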

+ 153 - 7
ggml/test_unity_cpp.py

@@ -6,13 +6,17 @@ import numpy as np
 import torch
 import fairseq2.nn
 import fairseq2.nn.transformer
+import logging
+import sys
+from pathlib import Path
+from ctypes_utils import Ptr
 from ctypes import c_void_p
 from typing import Any
 from pathlib import Path
 from typing import Iterator
 from ggml import NativeObj
 from ggml_convert import convert_model
-from seamless_communication.models.unity import load_unity_model
+from seamless_communication.models.inference.translator import Translator, Modality

 Ctx = ggml.ggml_context_p
 
 
@@ -276,12 +280,19 @@ def g_model(ctx: Ctx, g_model_once: c_void_p) -> c_void_p:


 @pytest.fixture(scope="module")
-def pt_model() -> Iterator[Any]:
-    model = load_unity_model("seamlessM4T_medium")
-    print(model)
-    model.eval()
+def translator() -> Iterator[Any]:
+    tr = Translator(
+        "seamlessM4T_medium", "vocoder_36langs", torch.device("cpu"), torch.float32
+    )
     with torch.inference_mode():
-        yield model
+        yield tr
+
+
+@pytest.fixture(scope="module")
+def pt_model(translator: Translator) -> Any:
+    model = translator.model
+    print(model)
+    return model


 @pytest.mark.xfail(reason="TODO")
     assert np.allclose(mask, mask_exp)
     assert np.allclose(mask, mask_exp)
 
 
 
+def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
+    seq = torch.zeros((4, 20, 1024), dtype=torch.float32)
+    # this _legacy_pad_idx is suspicious. Shouldn't the model use 1? But
+    # this is consistent with pt_model.text_decoder_frontend.pos_encoder._sin_offset
+    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=0)
+    y_exp = pos_encoder(seq, None)[0].numpy()
+
+    gseq = ggml.from_numpy(ctx, seq[0].numpy())
+    ggml.ggml_set_name(gseq, b"seq")
+    gy = ggml.forward(
+        "PositionalEmbedding", g_model, "text_decoder_frontend.pos_encoder", gseq
+    )
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+    y = ggml.to_numpy(gy)
+
+    assert y.shape == y_exp.shape
+    assert np.allclose(y_exp, y, atol=1e-6)
+
+
+def test_TransformerEmbeddingFrontend_forward(
+    ctx: Ctx, g_model: c_void_p, pt_model: Any
+) -> None:
+    seq = torch.arange(20).reshape(1, 20)
+    seq_len = torch.tensor([20])
+    gseq = ggml.from_numpy(ctx, seq[0].numpy().astype(np.int32))
+    ggml.ggml_set_name(gseq, b"seq")
+    gy = ggml.forward(
+        "TransformerEmbeddingFrontend", g_model, "text_decoder_frontend", gseq
+    )
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+    y = ggml.to_numpy(gy)
+
+    y_exp, _ = pt_model.text_decoder_frontend(seq, seq_len)
+    y_exp = y_exp.squeeze(0).numpy()  # remove batch dimension
+
+    assert y.shape == y_exp.shape
+    assert np.allclose(y_exp, y, atol=1e-6)
+
 
 
 def test_StandardTransformerDecoder_forward(
     ctx: Ctx, g_model: c_void_p, pt_model: Any
@@ -577,7 +628,6 @@ def test_StandardTransformerDecoder_forward(
     )
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
-
     y = ggml.to_numpy(gy)

     y_exp, _ = pt_model.text_decoder(x, padding_mask, encoder_out, None)
@@ -585,3 +635,99 @@ def test_StandardTransformerDecoder_forward(

     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4)
+
+
+def test_t2tt(ctx: Ctx, g_model: c_void_p):
+    # device = translator.device
+    src_lang = "eng"
+    src_text = "We are all in a yellow submarine."
+    tgt_lang = "fra"
+    # token_encoder = translator.text_tokenizer.create_encoder(
+    #     task="translation", lang=src_lang, mode="source", device=device
+    # )
+    # src = translator.collate(token_encoder(src_text))
+
+    # text_out, _ = translator.get_prediction(
+    #     translator.model,
+    #     translator.text_tokenizer,
+    #     translator.unit_tokenizer,
+    #     src,
+    #     input_modality=Modality.TEXT,
+    #     output_modality=Modality.TEXT,
+    #     tgt_lang=tgt_lang,
+    # )
+
+    # tgt_text = str(text_out.sentences[0])
+    # assert tgt_text == "Nous sommes tous dans un sous-marin jaune."
+    # tgt_tokens = text_out.generator_output.results[0][0].seq
+    # score = text_out.generator_output.results[0][0].score.item()
+    # np.savez(
+    #     Path(__file__).parent / "sample_input.npz",
+    #     score=score,
+    #     encoder_output=text_out.encoder_output.squeeze(0).numpy(),
+    #     encoder_padding_mask=text_out.encoder_padding_mask.squeeze(0).numpy(),
+    #     tgt_tokens=tgt_tokens.numpy(),
+    # )
+
+    text_out = np.load(Path(__file__).parent / "sample_input.npz")
+    score = text_out["score"].item()
+
+    tgt_tokens = ggml.from_numpy(ctx, text_out["tgt_tokens"].astype(np.int32))
+    encoder_out = ggml.from_numpy(ctx, text_out["encoder_output"])
+    encoder_padding_mask = ggml.from_numpy(ctx, text_out["encoder_padding_mask"])
+
+    job = ggml.SequenceGeneratorJob()
+    job.opts.beam_size = 1
+    job.opts.min_seq_len = 1
+    job.opts.soft_max_seq_len_a = 1
+    job.opts.soft_max_seq_len_b = 200
+    job.opts.hard_max_seq_len = 1024
+    job.opts.len_penalty = 1.0
+    job.opts.unk_penalty = 0.0
+    job.prefix_seq = ggml.from_numpy(ctx, text_out["tgt_tokens"].astype(np.int32)[:1])
+    job.eos_idx = 3
+
+    result = ctypes.byref(ggml.ggml_tensor())
+    g_score = ggml.generate_sequence(
+        g_model, job, encoder_out, encoder_padding_mask, result
+    )
+    breakpoint()
+    assert g_score == pytest.approx(score)
+
+
+def test_in_loop(ctx: Ctx, g_model: c_void_p, pt_model: Any):
+    resources = locals()
+
+    import importlib
+    import time
+
+    testcase = test_TransformerEmbeddingFrontend_forward.__name__
+    name, script = __name__, __file__
+    root = Path(__file__).parent
+    watched_files = [Path(__file__), root / "ggml.py", root / "build/src/libggml.so"]
+    last_try = 0.0
+
+    while True:
+        last_save = max(f.stat().st_mtime for f in watched_files)
+        if last_save <= last_try:
+            time.sleep(0.1)
+            continue
+
+        last_try = last_save
+        spec = importlib.util.spec_from_file_location(name, script)
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        sys.modules[name] = module
+        f = getattr(module, testcase)
+        f_args = [k for k in f.__annotations__ if k != "return"]
+        try:
+            f(**{k: resources[k] for k in f_args})
+            print(f"Testcase {testcase} success")
+        except AssertionError as e:
+            print(f"Testcase {testcase} failed: {e}")
+
+        except Exception as e:
+            import pdb
+
+            logging.exception(f"Testcase {testcase} crashed !")
+            pdb.post_mortem()