
SinusoidalPositionEncoder + WIP: TransformerEmbeddingFrontend

Guillaume Wenzek, 1 year ago
commit 2238cea072
5 changed files with 346 additions and 30 deletions
  1. ggml/examples/unity/fairseq2.cpp (+92 -15)
  2. ggml/examples/unity/fairseq2.h (+7 -2)
  3. ggml/ggml.py (+40 -3)
  4. ggml/ggml_convert.py (+54 -3)
  5. ggml/test_unity_cpp.py (+153 -7)

+ 92 - 15
ggml/examples/unity/fairseq2.cpp

@@ -59,7 +59,8 @@ extern "C" ggml_tensor* Linear_forward(
 extern "C" ggml_tensor* LayerNorm_forward(
     fairseq2_model& model,
     const std::string &prefix,
-    ggml_tensor* input) {
+    ggml_tensor* input
+) {
     ggml_tensor* weight = model.tensors[prefix + ".weight"];
     GGML_ASSERT(weight != nullptr);
     ggml_tensor* bias = model.tensors[prefix + ".bias"];
@@ -222,6 +223,74 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
     return seqs;
 }
 
+struct ggml_tensor * ggml_slice(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int axis,
+        int64_t               start,
+        int64_t               end
+    ) {
+    int64_t ne[4];
+    std::copy(a->ne, a->ne + 4, ne);
+    if (start < 0) start = ne[axis] + start;
+    if (end < 0) end = ne[axis] + end;
+    GGML_ASSERT(0 <= start);
+    GGML_ASSERT(start <= end);
+    GGML_ASSERT(end <= ne[axis]);
+
+    ne[axis] = end - start;
+    size_t offset = a->nb[axis] * start;
+
+    size_t* nb = a->nb;
+    ggml_tensor* result = ggml_view_4d(ctx, a, ne[0], ne[1], ne[2], ne[3], nb[1], nb[2], nb[3], offset);
+    result->n_dims = a->n_dims;
+    return result;
+}
+
+
+extern "C" ggml_tensor* PositionalEmbedding_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* embeds
+) {
+    int encoding_dim = embeds->ne[0];
+    int seq_len = embeds->ne[1];
+    ggml_tensor* full_pos_embeds = model.tensors[prefix];
+    ggml_tensor* pos_embeds = ggml_slice(model.ctx, full_pos_embeds, /*axis*/1, 0, seq_len);
+    return ggml_add(model.ctx, embeds, pos_embeds);
+}
+
+extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+    // TODO: state_bag
+) {
+    ggml_context* ctx = model.ctx;
+    ggml_tensor* embed_weights = model.tensors[prefix + ".embed.weight"];
+    GGML_ASSERT(embed_weights != nullptr);
+    ggml_tensor* embeds = ggml_get_rows(ctx, embed_weights, seqs);
+
+    // padding_mask = to_padding_mask(embeds, seq_lens)
+
+    // TODO: scale when saving the model weights
+    // embeds = ggml_scale embeds * self.scale
+
+    if (has_layer(model, prefix + ".pos_encoder")) {
+        // This only works with the simple pos encoders
+        int encoding_dim = embeds->ne[0];
+        int seq_len = embeds->ne[1];
+        ggml_tensor* pos_embeds = ggml_view_2d(ctx, model.tensors[prefix + ".pos_encoder"], encoding_dim, seq_len, 0, 0);
+        embeds = ggml_add(ctx, embeds, pos_embeds);
+    }
+
+    if (has_layer(model, prefix + ".layer_norm")) {
+        embeds = LayerNorm_forward(model, prefix + ".layer_norm", embeds);
+    }
+
+    // padding mask ?
+    return embeds;
+}
 
 extern "C" ggml_tensor* StandardTransformerEncoder_forward(
     fairseq2_model& model,
@@ -389,13 +458,13 @@ extern "C" ggml_tensor* StandardTransformerDecoder_forward(
 using IncrementalStateBag = std::unordered_map<ggml_tensor*, ggml_tensor*>*;
 
 
-int _determine_max_seq_len(const SequenceGeneratorJob& job) {
+int _determine_max_seq_len(const SequenceGeneratorJob& job, int source_seq_len) {
     auto opts = job.opts;
     int max_seq_len = -1;
-    if (job.source_seq_len <= 0 || opts.soft_max_seq_len_a <= 0) {
+    if (source_seq_len <= 0 || opts.soft_max_seq_len_a <= 0) {
         max_seq_len = opts.hard_max_seq_len;
     } else {
-        max_seq_len = std::min(opts.hard_max_seq_len, int(opts.soft_max_seq_len_a * job.source_seq_len + opts.soft_max_seq_len_b));
+        max_seq_len = std::min(opts.hard_max_seq_len, int(opts.soft_max_seq_len_a * source_seq_len + opts.soft_max_seq_len_b));
     }
 
     if (opts.min_seq_len > max_seq_len) {
@@ -432,11 +501,12 @@ void _fan_out_encoder_output(
 
     // (B, S_enc, M)
     ggml_tensor* shape = ggml_new_tensor_3d(ctx, GGML_TYPE_I8, encoder_output->ne[0], encoder_output->ne[1], beam_size);
-
     // (S_enc, M) -> (B, S_enc, M)
     *encoder_output_out = ggml_repeat(ctx, encoder_output, shape);
+    // (S_enc) -> (B, S_enc)
+    ggml_tensor* shape_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_I8, encoder_padding_mask->ne[0], beam_size);
     if (encoder_padding_mask != nullptr) {
-        *encoder_padding_mask_out = ggml_repeat(ctx, encoder_padding_mask, shape);
+        *encoder_padding_mask_out = ggml_repeat(ctx, encoder_padding_mask, shape_mask);
     }
 }
 
@@ -464,7 +534,7 @@ void _bootstrap_seqs_and_scores(
     ggml_context* ctx = model.ctx;
 
     // seqs[:, : prefix_seq_len] = job.prefix_seq;
-    ggml_cpy(ctx, job.prefix_seq, ggml_view_2d(ctx, seqs, 0, prefix_seq_len, 0, 0));
+    ggml_cpy(ctx, job.prefix_seq, ggml_view_2d(ctx, seqs, 0, prefix_seq_len, seqs->nb[1], 0));
 
     // We have to bootstrap the model with the already fanned-out encoder
     // output to correctly initialize its incremental state. This causes some
@@ -477,7 +547,7 @@ void _bootstrap_seqs_and_scores(
     // Bootstrap the model state with prefix sequence.
     ggml_tensor* decoder_output = StandardTransformerDecoder_forward(
         model,
-        ".decoder",
+        "text_decoder",
         seqs,
         /*padding_mask*/ nullptr,
         encoder_output,
@@ -487,7 +557,7 @@ void _bootstrap_seqs_and_scores(
     // TODO state_bag.increment_step(prefix_seq_len - 1)
 
     // logits, lprobs: (N, S_pfx - 1, V)
-    ggml_tensor* logits = Linear_forward(model, ".decoder.final_proj", decoder_output);
+    ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output);
     ggml_tensor* lprobs = ggml_log_softmax(ctx, ggml_view_3d(ctx, logits, logits->ne[0], logits->ne[1], 1, 0, 0, 0));
     int vocab_size = logits->ne[0];
 
@@ -622,23 +692,29 @@ bool _finalize_hypothesis(
 }
 
 /// Generates a translation for a single sequence
+// TODO: finish this for beam_size=1
+// * implement the lprobs tweaking
+// TODO: add IncrementalStateBag support to avoid a O(N^3) generation.
+// TODO: support beam_size > 1:
+// * most layers assume un-batched input, but we want to handle several beams at once
+// * need to port "reorder_state_dict"
+// * once beams are selected with topk, we need to update seqs and scores tensors
 extern "C" float generate_sequence(
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
-    ggml_tensor** output_seq
+    ggml_tensor* output_seq
 ) {
-    int input_seq_len = encoder_output->ne[1];
     int vocab_size = encoder_output->ne[0];
     int beam_size = job.opts.beam_size;
-    int max_seq_len = _determine_max_seq_len(job);
+    int source_seq_len = encoder_output->ne[1];
+    int max_seq_len = _determine_max_seq_len(job, source_seq_len);
     ggml_context* ctx = model.ctx;
 
     // (S_enc, M) -> (B, S_enc, M)
     _fan_out_encoder_output(ctx, &encoder_output, &encoder_padding_mask, beam_size);
 
-    std::vector<Hypothesis> active_searches(beam_size);
     std::vector<Hypothesis> finished_searches(beam_size);
 
     // Initialize buffers. (B, S)
@@ -688,9 +764,10 @@ extern "C" float generate_sequence(
         //     // state_bag.reorder(beam_indices)
         // }
 
+        seqs = TransformerEmbeddingFrontend_forward(model, "text_decoder_frontend", seqs);
         ggml_tensor* decoder_output = StandardTransformerDecoder_forward(
             model,
-            ".decoder",
+            "text_decoder",
             // seqs[:, step_nr : step_nr + 1]
             ggml_view_2d(ctx, seqs, 1, beam_size, step_nr * seqs->nb[0], 0),
             nullptr,  // We never generate PAD.
@@ -701,7 +778,7 @@ extern "C" float generate_sequence(
 
         // state_bag.increment_step()
 
-        ggml_tensor* logits = Linear_forward(model, ".decoder.final_proj", decoder_output);
+        ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output);
         ggml_tensor* lprobs = ggml_log_softmax(ctx, logits);
 
         // // Do not allow EOS before reaching the minimum sequence length.

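Note on PositionalEmbedding_forward: because the conversion script below (ggml_convert.py) now stores the full sinusoidal table in the checkpoint under the pos_encoder prefix, the C++ forward pass reduces to slicing the first seq_len positions out of that table and adding them to the embeddings. A rough numpy equivalent is sketched here (the function name is purely illustrative; remember that ggml's ne[0] is the innermost axis, so axis 1 in the C++ ggml_slice call is the sequence axis):

    import numpy as np

    def positional_embedding_forward(full_pos_embeds: np.ndarray, embeds: np.ndarray) -> np.ndarray:
        # full_pos_embeds: (max_seq_len, encoding_dim) table baked into the checkpoint
        # embeds:          (seq_len, encoding_dim) token embeddings
        seq_len = embeds.shape[0]
        pos_embeds = full_pos_embeds[:seq_len]  # plays the role of ggml_slice(..., /*axis*/1, 0, seq_len)
        return embeds + pos_embeds
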
+ 7 - 2
ggml/examples/unity/fairseq2.h

@@ -55,6 +55,12 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     ggml_tensor* _ // (klen, slen)  TODO: do we need to pass mask here ?
 );
 
+extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward(
+    fairseq2_model& model,
+    const std::string& prefix,
+    ggml_tensor* seqs
+);
+
 extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
     fairseq2_model& model,
     const std::string& prefix,
@@ -105,7 +111,6 @@ struct SequenceGeneratorOptions {
 struct SequenceGeneratorJob {
     SequenceGeneratorOptions opts;
     ggml_tensor* prefix_seq;
-    int source_seq_len;
     std::int32_t eos_idx;
 };
 
@@ -115,5 +120,5 @@ extern "C" float generate_sequence(
     const SequenceGeneratorJob& opts,
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask,
-    ggml_tensor** output_seq
+    ggml_tensor* output_seq
 );

+ 40 - 3
ggml/ggml.py

@@ -16,6 +16,7 @@ from typing import Union
 from typing import Type
 
 from third_party_ggml import *
+from ctypes_utils import c_struct, c_fn, Ptr
 
 ### Helpers
 
@@ -29,12 +30,17 @@ def numpy_dtype(ggml_type: ctypes.c_int) -> type:
         # GGML_TYPE_F16  = 1,
         return np.float16
 
+    if ggml_type == 18:
+        return np.int32
+
     raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype")
 
 
 def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int:
     if dtype == np.float32:
         return ctypes.c_int(0)
+    elif dtype == np.int32:
+        return ctypes.c_int(18)
     elif dtype == np.float16:
         return ctypes.c_int(1)
     raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE")
@@ -288,8 +294,39 @@ def forward(
     with CppStr(prefix) as std_prefix:
         return fwd(model, std_prefix, *inputs)  # ignore: type[no-any-return]
 
-lib.causal_attention_mask.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)]
-lib.causal_attention_mask.restype = ctypes.POINTER(ggml_tensor)
 
-def causal_attention_mask(ctx: ggml_context_p, seqs: ggml_tensor_p) -> ggml_tensor_p:
+@c_fn(lib)
+def causal_attention_mask(
+    ctx: ggml_context_p, seqs: Ptr[ggml_tensor]
+) -> Ptr[ggml_tensor]:
     return lib.causal_attention_mask(ctx, seqs)  # type: ignore[no-any-return]
+
+
+@c_struct
+class SequenceGeneratorOptions:
+    beam_size: int
+    min_seq_len: int
+    soft_max_seq_len_a: int
+    soft_max_seq_len_b: int
+    hard_max_seq_len: int
+    len_penalty: float
+    unk_penalty: float
+    normalize_scores: bool
+
+
+@c_struct
+class SequenceGeneratorJob:
+    opts: SequenceGeneratorOptions
+    prefix_seq: Ptr[ggml_tensor]
+    eos_idx: int
+
+
+@c_fn(lib)
+def generate_sequence(
+    model: ctypes.c_void_p,
+    job: Ptr[SequenceGeneratorJob],
+    encoder_output: Ptr[ggml_tensor],
+    encoder_padding_mask: Ptr[ggml_tensor],
+    output_seq: Ptr[ggml_tensor],
+) -> float:
+    ...

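Note on the ctypes bindings: the @c_struct and @c_fn decorators come from ctypes_utils, which is not part of this diff, so the following is only a guess at the general mechanism: map the annotated fields of a class onto a ctypes.Structure so that jobs like SequenceGeneratorJob can be filled in from Python and passed by reference to the C code.

    import ctypes
    from typing import get_type_hints

    _SCALARS = {int: ctypes.c_int, float: ctypes.c_float, bool: ctypes.c_bool}

    def c_struct(cls):
        # Hypothetical sketch: scalar annotations become ctypes scalars; anything
        # else (a nested, already-decorated struct or a ctypes pointer type) is
        # passed through unchanged.
        fields = [(name, _SCALARS.get(hint, hint)) for name, hint in get_type_hints(cls).items()]
        return type(cls.__name__, (ctypes.Structure,), {"_fields_": fields})

    @c_struct
    class Options:
        beam_size: int
        len_penalty: float

    opts = Options(beam_size=2, len_penalty=1.0)
    assert opts.beam_size == 2
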
+ 54 - 3
ggml/ggml_convert.py

@@ -11,10 +11,12 @@ from enum import Enum
 from io import BufferedWriter
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Tuple, Union
-
 import torch
 import ggml
+from typing import List
 from fairseq2.assets import AssetCard
+from fairseq2.models.transformer.frontend import TransformerEmbeddingFrontend
+from fairseq2.nn import SinusoidalPositionEncoder
 from seamless_communication.models.unity import load_unity_config, load_unity_model
 
 Preprocessor = Callable[[Any], Any]
@@ -33,13 +35,59 @@ def convert_model(model_name: str, out: Optional[Path] = None) -> None:
     else:
         raise ValueError(f"Unsupported model type: {model_name}")
 
+    state_dict = model.state_dict()
+    fixup_model(model, state_dict)
+
     with out.open("wb") as o:
-        write_ggml_file(o, hparams, model.state_dict())
+        write_ggml_file(o, hparams, state_dict)
 
     with out.with_suffix(".hparams.h").open("w") as h:
         h.write(generate_hparams_struct(hparams, "unity_hparams"))
 
 
+def _nested_getattr(model: Any, name: str) -> Any:
+    parts = name.split(".")
+    node = model
+    for part in parts:
+        node = getattr(node, part)
+        if node is None:
+            return None
+    return node
+
+
+def find_children(model: torch.nn.Module, t: type) -> List[Tuple[str, torch.nn.Module]]:
+    queue = list(model._modules.items())
+    modules = []
+    while queue:
+        name, node = queue.pop()
+        if node is None:
+            continue
+        if isinstance(node, t):
+            modules.append((name, node))
+        for child_name, child_node in node._modules.items():
+            queue.append((".".join((name, child_name)), child_node))
+
+    return modules
+
+
+def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) -> None:
+    # Bake the embedding scaling into the weights
+    frontends = find_children(model, TransformerEmbeddingFrontend)
+    print("Upgrading the following TransformerEmbeddingFrontend:", [x[0] for x in frontends])
+    for name, frontend in frontends:
+        embed_weights = state_dict[name + ".embed.weight"]
+        state_dict[name + ".embed.weight"] = embed_weights * frontend.scale
+
+    # Sinusoidal embeddings are typically not saved since they are easily recomputed,
+    # but this allows us to avoid porting the sinusoidal logic to GGML
+    pos_encoders = find_children(model, SinusoidalPositionEncoder)
+    print("Upgrading the following SinusoidalPositionEncoder:", [x[0] for x in pos_encoders])
+    for name, pos_encoder in pos_encoders:
+        assert isinstance(pos_encoder.weight, torch.Tensor)
+        assert name not in state_dict
+        state_dict[name] = pos_encoder.weight
+
+
 def write_ggml_file(
     out: BufferedWriter, hparams: Dict[str, Any], state_dict: Dict[str, torch.Tensor]
 ) -> None:
@@ -52,7 +100,9 @@ def write_ggml_file(
         # + tensor overhead
         byte_size += ggml.ggml_tensor_overhead() * (len(state_dict) + 10)
         hparams["model_byte_size"] = byte_size
-        logging.warning(f"Saving a ggml file with {len(state_dict)} tensors, for an estimated amount of {byte_size / (1024**3)} GGML Gb")
+        logging.warning(
+            f"Saving a ggml file with {len(state_dict)} tensors, for an estimated amount of {byte_size / (1024**3)} GGML Gb"
+        )
     # 6877961321223123048
     hparams["__end_of_hparams__"] = struct.unpack("l", b"hparams_")[0]
 
@@ -140,6 +190,7 @@ def write_tensor(out: BufferedWriter, value: torch.Tensor) -> None:
 
     data.tofile(out)
 
+
 def torch_to_ggml_type(dtype: type) -> int:
     if dtype is torch.float32:
         return ggml.GGML_TYPE_F32

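Note on fixup_model: baking the embedding scale into embed.weight is safe because fairseq2's TransformerEmbeddingFrontend multiplies the looked-up embeddings by its scale factor (the square root of the embedding dimension by default), and an embedding lookup is just a row gather, so pre-scaling the weight matrix at conversion time yields the same activations. That is what lets TransformerEmbeddingFrontend_forward in fairseq2.cpp skip the scaling (see its TODO). A small sanity check, with a hypothetical helper name:

    import torch

    def check_scale_baking(vocab_size: int = 10, dim: int = 4) -> None:
        # An embedding lookup is a row gather, so scaling the rows up front is
        # the same as scaling the gathered embeddings afterwards.
        weight = torch.randn(vocab_size, dim)
        ids = torch.tensor([1, 3, 3, 7])
        scale = dim ** 0.5

        scaled_after = weight[ids] * scale    # scale applied in the forward pass
        baked = (weight * scale)[ids]         # scale baked into the checkpoint
        assert torch.allclose(scaled_after, baked)
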
+ 153 - 7
ggml/test_unity_cpp.py

@@ -6,13 +6,17 @@ import numpy as np
 import torch
 import fairseq2.nn
 import fairseq2.nn.transformer
+import logging
+import sys
+from pathlib import Path
+from ctypes_utils import Ptr
 from ctypes import c_void_p
 from typing import Any
 from pathlib import Path
 from typing import Iterator
 from ggml import NativeObj
 from ggml_convert import convert_model
-from seamless_communication.models.unity import load_unity_model
+from seamless_communication.models.inference.translator import Translator, Modality
 
 Ctx = ggml.ggml_context_p
 
@@ -276,12 +280,19 @@ def g_model(ctx: Ctx, g_model_once: c_void_p) -> c_void_p:
 
 
 @pytest.fixture(scope="module")
-def pt_model() -> Iterator[Any]:
-    model = load_unity_model("seamlessM4T_medium")
-    print(model)
-    model.eval()
+def translator() -> Iterator[Any]:
+    tr = Translator(
+        "seamlessM4T_medium", "vocoder_36langs", torch.device("cpu"), torch.float32
+    )
     with torch.inference_mode():
-        yield model
+        yield tr
+
+
+@pytest.fixture(scope="module")
+def pt_model(translator: Translator) -> Any:
+    model = translator.model
+    print(model)
+    return model
 
 
 @pytest.mark.xfail(reason="TODO")
@@ -551,6 +562,46 @@ def test_causal_attention_mask(ctx: Ctx):
     assert np.allclose(mask, mask_exp)
 
 
+def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
+    seq = torch.zeros((4, 20, 1024), dtype=torch.float32)
+    # this _legacy_pad_idx is suspicious. Shouldn't the model use 1 ? But
+    # this is consistent with pt_model.text_decoder_frontend.pos_encoder._sin_offset
+    pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=0)
+    y_exp = pos_encoder(seq, None)[0].numpy()
+
+    gseq = ggml.from_numpy(ctx, seq[0].numpy())
+    ggml.ggml_set_name(gseq, b"seq")
+    gy = ggml.forward(
+        "PositionalEmbedding", g_model, "text_decoder_frontend.pos_encoder", gseq
+    )
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+    y = ggml.to_numpy(gy)
+
+    assert y.shape == y_exp.shape
+    assert np.allclose(y_exp, y, atol=1e-6)
+
+
+def test_TransformerEmbeddingFrontend_forward(
+    ctx: Ctx, g_model: c_void_p, pt_model: Any
+) -> None:
+    seq = torch.arange(20).reshape(1, 20)
+    seq_len = torch.tensor([20])
+    gseq = ggml.from_numpy(ctx, seq[0].numpy().astype(np.int32))
+    ggml.ggml_set_name(gseq, b"seq")
+    gy = ggml.forward(
+        "TransformerEmbeddingFrontend", g_model, "text_decoder_frontend", gseq
+    )
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+    y = ggml.to_numpy(gy)
+
+    y_exp, _ = pt_model.text_decoder_frontend(seq, seq_len)
+    y_exp = y_exp.squeeze(0).numpy()  # remove batch dimension
+
+    assert y.shape == y_exp.shape
+    assert np.allclose(y_exp, y, atol=1e-6)
+
 
 def test_StandardTransformerDecoder_forward(
     ctx: Ctx, g_model: c_void_p, pt_model: Any
@@ -577,7 +628,6 @@ def test_StandardTransformerDecoder_forward(
     )
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
-
     y = ggml.to_numpy(gy)
 
     y_exp, _ = pt_model.text_decoder(x, padding_mask, encoder_out, None)
@@ -585,3 +635,99 @@ def test_StandardTransformerDecoder_forward(
 
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4)
+
+
+def test_t2tt(ctx: Ctx, g_model: c_void_p):
+    # device = translator.device
+    src_lang = "eng"
+    src_text = "We are all in a yellow submarine."
+    tgt_lang = "fra"
+    # token_encoder = translator.text_tokenizer.create_encoder(
+    #     task="translation", lang=src_lang, mode="source", device=device
+    # )
+    # src = translator.collate(token_encoder(src_text))
+
+    # text_out, _ = translator.get_prediction(
+    #     translator.model,
+    #     translator.text_tokenizer,
+    #     translator.unit_tokenizer,
+    #     src,
+    #     input_modality=Modality.TEXT,
+    #     output_modality=Modality.TEXT,
+    #     tgt_lang=tgt_lang,
+    # )
+
+    # tgt_text = str(text_out.sentences[0])
+    # assert tgt_text == "Nous sommes tous dans un sous-marin jaune."
+    # tgt_tokens = text_out.generator_output.results[0][0].seq
+    # score = text_out.generator_output.results[0][0].score.item()
+    # np.savez(
+    #     Path(__file__).parent / "sample_input.npz",
+    #     score=score,
+    #     encoder_output=text_out.encoder_output.squeeze(0).numpy(),
+    #     encoder_padding_mask=text_out.encoder_padding_mask.squeeze(0).numpy(),
+    #     tgt_tokens=tgt_tokens.numpy(),
+    # )
+
+    text_out = np.load(Path(__file__).parent / "sample_input.npz")
+    score = text_out["score"].item()
+
+    tgt_tokens = ggml.from_numpy(ctx, text_out["tgt_tokens"].astype(np.int32))
+    encoder_out = ggml.from_numpy(ctx, text_out["encoder_output"])
+    encoder_padding_mask = ggml.from_numpy(ctx, text_out["encoder_padding_mask"])
+
+    job = ggml.SequenceGeneratorJob()
+    job.opts.beam_size = 1
+    job.opts.min_seq_len = 1
+    job.opts.soft_max_seq_len_a = 1
+    job.opts.soft_max_seq_len_b = 200
+    job.opts.hard_max_seq_len = 1024
+    job.opts.len_penalty = 1.0
+    job.opts.unk_penalty = 0.0
+    job.prefix_seq = ggml.from_numpy(ctx, text_out["tgt_tokens"].astype(np.int32)[:1])
+    job.eos_idx = 3
+
+    result = ctypes.byref(ggml.ggml_tensor())
+    g_score = ggml.generate_sequence(
+        g_model, job, encoder_out, encoder_padding_mask, result
+    )
+    breakpoint()
+    assert g_score == pytest.approx(score)
+
+
+def test_in_loop(ctx: Ctx, g_model: c_void_p, pt_model: Any):
+    resources = locals()
+
+    import importlib
+    import time
+
+    testcase = test_TransformerEmbeddingFrontend_forward.__name__
+    name, script = __name__, __file__
+    root = Path(__file__).parent
+    watched_files = [Path(__file__), root / "ggml.py", root / "build/src/libggml.so"]
+    last_try = 0.0
+
+    while True:
+        last_save = max(f.stat().st_mtime for f in watched_files)
+        if last_save <= last_try:
+            time.sleep(0.1)
+            continue
+
+        last_try = last_save
+        spec = importlib.util.spec_from_file_location(name, script)
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        sys.modules[name] = module
+        f = getattr(module, testcase)
+        f_args = [k for k in f.__annotations__ if k != "return"]
+        try:
+            f(**{k: resources[k] for k in f_args})
+            print(f"Testcase {testcase} success")
+        except AssertionError as e:
+            print(f"Testcase {testcase} failed: {e}")
+
+        except Exception as e:
+            import pdb
+
+            logging.exception(f"Testcase {testcase} crashed !")
+            pdb.post_mortem()
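
Note on the stored positional table: the table that fixup_model bakes into the checkpoint (and that test_PositionalEmbedding_forward compares against fairseq2.nn.SinusoidalPositionEncoder) is a plain sinusoidal position encoding. A sketch of the classic formulation follows; fairseq2's exact layout (interleaving and the _legacy_pad_idx offset probed in the test) may differ, which is precisely why the commit stores the real table instead of re-deriving it in GGML:

    import numpy as np

    def sinusoidal_table(max_seq_len: int, dim: int) -> np.ndarray:
        # Positions at geometrically spaced frequencies, sin in even columns,
        # cos in odd columns ("Attention Is All You Need" layout).
        positions = np.arange(max_seq_len)[:, None]                  # (S, 1)
        inv_freq = 1.0 / (10000.0 ** (np.arange(0, dim, 2) / dim))   # (dim/2,)
        angles = positions * inv_freq                                # (S, dim/2)
        table = np.zeros((max_seq_len, dim), dtype=np.float32)
        table[:, 0::2] = np.sin(angles)
        table[:, 1::2] = np.cos(angles)
        return table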