
Unity inc (#159)

* use a bit more add_inplace

* use blas even for short sequences

* enable openblas for x8 speedup :-)

* add_inplace everywhere !

* cleanup deps

* more openblas

* fix test_WaveformToFbankConverter

* optimize (B, 1, D_out) @ (D_in, D_out) matmul (see sketch below)

* tweak_lprobs

* SC fixes

* add vocab in the model

* add test_tokenizer

* detokenize

* handle leading space

* format

ruff check --fix --select I *.py
ruff format *.py

* WIP fix tests

* add tracy profiling

* sort cpp imports

* CLI & bug fix

* fmt & revert unnecessary changes

* rm remove_head_row & get_first_cols_by_rows

* cli: use fairseq2_spm_detokenize

* use more fine-grained contexts for beam search

lower memory usage from xGB to <0.5GB

* handle several input files

---------

Co-authored-by: cndn <373515162@qq.com>
Guillaume Wenzek 1 year ago
parent
commit
a768cdf55f
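
The "optimize (B, 1, D_out) @ (D_in, D_out) matmul" item above refers to the new mul_mat helper added to examples/unity/fairseq2.cpp further down in this diff. A minimal sketch of the idea, assuming the ggml_flatten_1d / ggml_unflatten_1d custom ops this codebase already provides (the helper name here is hypothetical):

#include "ggml.h"

// With one token per beam the activations are (B, 1, D_in); merging the batch
// dimension turns B tiny mat-vec products into a single GEMM that OpenBLAS can
// accelerate, and the result is split back into (B, 1, D_out) afterwards.
ggml_tensor* batched_mul_mat(ggml_context* ctx, ggml_tensor* a, ggml_tensor* b) {
    if (b->ne[1] == 1 && b->ne[2] > 1 && a->n_dims == 2) {
        b = ggml_flatten_1d(ctx, b, 1);                               // (B, 1, D_in) -> (B, D_in)
        return ggml_unflatten_1d(ctx, ggml_mul_mat(ctx, a, b), 1, 1); // (B, D_out) -> (B, 1, D_out)
    }
    return ggml_mul_mat(ctx, a, b);
}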

+ 1 - 1
.gitignore

@@ -146,4 +146,4 @@ seamless_communication
 # ignore src/seamless_communication  
 !*/seamless_communication
 m4t_scripts
-/ggml/sample_input.npz
+/ggml/test_data/

+ 3 - 0
.gitmodules

@@ -0,0 +1,3 @@
+[submodule "ggml/tracy"]
+	path = ggml/tracy
+	url = git@github.com:wolfpld/tracy.git

+ 4 - 0
ggml/CMakeLists.txt

@@ -160,6 +160,10 @@ target_include_directories(kaldi-native-fbank PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc
 )
 
+option( TRACY_ENABLE "" ON)
+option( TRACY_ON_DEMAND "" ON)
+add_subdirectory (tracy)
+
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
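
The tracy submodule and the TRACY_ENABLE / TRACY_ON_DEMAND options above only wire Tracy's client into the build; the actual instrumentation is the zone macros that now appear in fairseq2.cpp. A minimal sketch of the pattern (the function is hypothetical, the macros are Tracy's standard ones):

#include "tracy/Tracy.hpp"

void decode_one_step() {
    ZoneScoped;                        // times the whole function, as in generate_sequence
    {
        ZoneNamed(tweak_lprobs, true); // a named sub-zone, as around _tweak_lprobs and topk
        // ... per-step work ...
    }
}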

BIN
ggml/LJ037-0171_sr16k_test.wav


+ 40 - 3
ggml/Makefile

@@ -1,10 +1,47 @@
-build: build/src/libggml.so
+build: build/src/libggml.so ggml/build/bin/unity
 
-build/src/libggml.so: examples/unity/*.h examples/unity/*.cpp
+build/src/libggml.so: Makefile examples/unity/*.h examples/unity/*.cpp src/ggml*.c
 	mkdir -p build
-	cd build; cmake -DBUILD_SHARED_LIBS=On -DCMAKE_BUILD_TYPE=Debug ..
+	cd build; cmake\
+		-DGGML_OPENBLAS=ON \
+	  -DBUILD_SHARED_LIBS=On \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_CXX_FLAGS="-g2 -fno-omit-frame-pointer" \
+	  -DTRACY_ENABLE=ON \
+	  ..
 	cd build; make -j4 fairseq2_cpp
 	find build/ -iname '*.so'
 
+
+ggml/build/bin/unity: Makefile examples/unity/*.h examples/unity/*.cpp src/ggml*.c
+	mkdir -p build
+	cd build; cmake\
+		-DGGML_OPENBLAS=ON \
+	  -DBUILD_SHARED_LIBS=On \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_CXX_FLAGS="-g2 -fno-omit-frame-pointer" \
+	  -DTRACY_ENABLE=ON \
+	  ..
+	cd build; make -j4 unity
+	find build/ -iname '*.so'
+
+
 tests: build/src/libggml.so
 	pytest ./*.py -s
+
+build/src/libggml_cuda.so: Makefile examples/unity/*.h examples/unity/*.cpp
+	mkdir -p build
+	cd build; cmake\
+	  -DGGML_CUBLAS=ON \
+	  -DBUILD_SHARED_LIBS=On \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DCMAKE_CXX_FLAGS="-g2" \
+	  ..
+	cd build; make -j4 ggml
+	mv build/src/libggml.so build/src/libggml_cuda.so
+	find build/ -iname '*.so'
+
+cuda_tests: build/src/libggml_cuda.so
+	sed -i 's/lib_base_name = "ggml"/lib_base_name = "ggml_cuda"/' third_party_ggml.py
+	pytest ./*.py -s
+	sed -i 's/lib_base_name = "ggml_cuda"/lib_base_name = "ggml"/' third_party_ggml.py

+ 3 - 1
ggml/ctypes_utils.py

@@ -55,8 +55,10 @@ def _py_type_to_ctype(t: type) -> type:
         return ctypes.c_float
     if t is bool:
         return ctypes.c_bool
-    if t is str:
+    if t is bytes:
         return ctypes.c_char_p
+    if t is str:
+        raise ValueError("str type is't supported by ctypes ?")
 
     if getattr(t, "__origin__", None) is Ptr:
         pointee = _py_type_to_ctype(t.__args__[0])  # type: ignore

+ 6 - 0
ggml/examples/common.h

@@ -37,10 +37,16 @@ struct gpt_params {
     int32_t n_gpu_layers     = 0;
 };
 
+bool unity_params_parse(int argc, char ** argv, unity_params & params);
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
 
+void unity_print_usage(int /*argc*/, char ** argv, const unity_params & params);
+
 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 
+
+
 std::string gpt_random_prompt(std::mt19937 & rng);
 
 //

+ 11 - 2
ggml/examples/unity/CMakeLists.txt

@@ -1,5 +1,4 @@
-# fairseq2_cpp
-
+# unity
 add_library(fairseq2_cpp)
 target_include_directories(fairseq2_cpp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 target_link_libraries(fairseq2_cpp PRIVATE ggml kaldi-native-fbank)
@@ -8,3 +7,13 @@ target_sources(fairseq2_cpp
         fairseq2.cpp
         model_loader.cpp
 )
+add_executable(unity unity.cpp)
+find_package(PkgConfig REQUIRED)
+pkg_check_modules(SNDFILE REQUIRED sndfile)
+target_include_directories(unity PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../.. ${SNDFILE_INCLUDE_DIRS})
+target_link_libraries(unity PRIVATE ggml ${SNDFILE_LIBRARIES})
+target_sources(unity
+    PRIVATE
+        fairseq2.cpp
+        model_loader.cpp
+)

+ 403 - 152
ggml/examples/unity/fairseq2.cpp

@@ -1,18 +1,41 @@
+#include <algorithm>
+#include <fnmatch.h>
+#include <iostream>
 #include <math.h>
+#include <queue>
+#include <unordered_map>
+
 #include "kaldi-native-fbank/csrc/feature-fbank.h"
 #include "kaldi-native-fbank/csrc/feature-window.h"
-#include "ggml.h"
+#include "tracy/Tracy.hpp"
+
 #include "fairseq2.h"
-#include <unordered_map>
-#include <algorithm>
-#include <iostream>
-#include <fnmatch.h>
+#include "ggml.h"
 
-void ggml_detach(ggml_tensor* a) {
+ggml_tensor* ggml_detach(ggml_tensor* a) {
     a->op = GGML_OP_NONE;
     std::fill(a->src, a->src + GGML_MAX_SRC, nullptr);
+    return a;
 }
 
+#define DEBUG_MEM_USAGE 1
+
+void printf_mem_usage(ggml_context* ctx, std::string name) {
+#if DEBUG_MEM_USAGE
+    double mb = 1024.0 * 1024.0;
+    printf(
+        "ctx %s: memory used = %8.2f MB, memory reserved = %8.2f Mb\n",
+        name.c_str(),
+        ggml_used_mem(ctx) / mb,
+        ggml_get_mem_size(ctx) / mb
+    );
+#endif
+}
+
+#define SWAP(x, y) \
+    auto tmp_ ## x = x; x = y; y = tmp_ ## x;
+
+
 /// allocate the fairseq2 model and hyperparameters
 extern "C" fairseq2_model* fairseq2_model_alloc() {
     // pre-allocate some memory to write hyperparameters and tensors pointers
@@ -25,8 +48,8 @@ extern "C" void fairseq2_kv_cache_alloc(const fairseq2_model& model, int beam_si
     // Note: we only allocate the cache for the decoder attention.
     // For encoder attention since we compute it all at once,
     // the allocation is delayed to the first forward pass, to not over allocate.
-    auto attn_glob = "*decoder.*_attn.k_proj.weight";
-    auto self_attn_glob = "*decoder.*self_attn.k_proj.weight";
+    auto attn_glob = "text_decoder.*_attn.k_proj.weight";
+    auto self_attn_glob = "text_decoder.*self_attn.k_proj.weight";
     ggml_tensor* self_attn_mask = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, max_seq_len, max_seq_len);
     self_attn_mask = ggml_diag_mask_inf_inplace(model.ctx, self_attn_mask, 0);
     ggml_format_name(self_attn_mask, "self_attn_mask[%d]", max_seq_len);
@@ -132,8 +155,7 @@ void _reorder_kv_cache(ggml_context* ctx, ggml_cgraph* gf, KeyValueTensor& kv, g
 }
 
 
-void reorder_kv_cache(const fairseq2_model& model, ggml_cgraph* gf, ggml_tensor* new_order) {
-    ggml_context* ctx = model.ctx;
+void reorder_kv_cache(const fairseq2_model& model, ggml_context* ctx, ggml_cgraph* gf, ggml_tensor* new_order) {
     for (auto& named_kv : model.kv_cache) {
         _reorder_kv_cache(ctx, gf, named_kv.second, new_order);
     }
@@ -176,6 +198,20 @@ bool has_layer(fairseq2_model& model, const std::string& name) {
     return model.tensors.find(name) != model.tensors.end();
 }
 
+ggml_tensor* mul_mat(ggml_context* ctx, ggml_tensor* a, ggml_tensor* b) {
+    if (b->ne[1] == 1 && b->ne[2] > 1 &&  a->n_dims == 2) {
+        // `b` has shape (B, 1, D).
+        // if `a` is (D_out, D), then we do one matmul for the full batch.
+        b = ggml_flatten_1d(ctx, b, 1);
+        return ggml_unflatten_1d(ctx, ggml_mul_mat(ctx, a, b), 1, 1);
+    }
+    // there is also the k * q matmul -> (D, 1, B) * (D, 1, B) -> (1, 1, B)
+    // not sure what's the best way to compute this with BLAS
+
+    return ggml_mul_mat(ctx, a, b);  // (d_out)
+}
+
+
 extern "C" ggml_tensor* Linear_forward(
     fairseq2_model& model,
     const std::string &prefix,
@@ -184,8 +220,7 @@ extern "C" ggml_tensor* Linear_forward(
     // Note: for now we assumed un-batched input
     ggml_tensor* weight = model.tensors[prefix + ".weight"];  // (d_in, d_out)
     GGML_ASSERT(weight != nullptr);
-    ggml_tensor* out = ggml_mul_mat(model.ctx, weight, input);  // (d_out)
-
+    ggml_tensor* out = mul_mat(model.ctx, weight, input);  // (d_out)
     ggml_tensor* bias = model.tensors[prefix + ".bias"];  // (d_out)
     if (bias == nullptr) return out;
 
@@ -358,12 +393,13 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
             KeyValueTensor& kv_cache = model.kv_cache[prefix];
             if (kv_cache.step_nr == 0) {
                 k = Linear_forward(model, prefix + ".k_proj", keys);
-                ggml_format_name(k, "%s.k_cache", prefix.c_str());
                 v = Linear_forward(model, prefix + ".v_proj", values);
-                ggml_format_name(v, "%s.v_cache", prefix.c_str());
                 // TODO: encoder_padding_mask
-                kv_cache.full_k = k;
-                kv_cache.full_v = v;
+                // Note we are only storing a pointer to the buffer, not the full graph
+                kv_cache.full_k = ggml_detach(ggml_dup_inplace(ctx, k));
+                ggml_format_name(kv_cache.full_k, "%s.k_cache", prefix.c_str());
+                kv_cache.full_v = ggml_detach(ggml_dup_inplace(ctx, v));
+                ggml_format_name(kv_cache.full_v, "%s.v_cache", prefix.c_str());
                 kv_cache.step_nr = keys->ne[1];
             } else {
                 k = kv_cache.full_k;
@@ -395,11 +431,11 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     attn = ggml_permute(ctx, attn, 0, 2, 1, 3); // (B, S, H, H_dim)
 #else
     // (B * H, Sk, H_dim) x (B * H, S, H_dim) -> (B * H, S, Sk)
-    ggml_tensor* qk = ggml_mul_mat(ctx, k, q);
+    ggml_tensor* qk = mul_mat(ctx, k, q);
     ggml_set_name(qk, "qk");
     ggml_tensor* qk_scale = ggml_new_tensor_1d(ctx, qk->type, 1);
     ggml_set_f32(qk_scale, 1.0f/sqrtf(float(head_dim)));
-    qk = ggml_scale(ctx, qk, qk_scale);
+    qk = ggml_scale_inplace(ctx, qk, qk_scale);
     ggml_set_name(qk, "qk_scaled");
 
     // TODO: Should we replace this by ggml_diag_mask_inf ?
@@ -409,7 +445,7 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     ggml_set_name(attn_weights, "attn_weights");
 
     // (B * H, S, Sk) x (B * H, H_dim, Sk) -> (B * H, H_dim, S)
-    ggml_tensor* attn = ggml_mul_mat(ctx, attn_weights, v);
+    ggml_tensor* attn = mul_mat(ctx, attn_weights, v);
     ggml_set_name(attn, "attn");
     attn = ggml_unflatten_1d(ctx, attn, 2, num_heads);  // (B, H, H_dim, S)
     attn = ggml_permute(ctx, attn, 2, 0, 1, 3); // (B, S, H, H_dim)
@@ -452,7 +488,7 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
     if (has_layer(model, prefix + ".self_attn_norm"))
         seqs = LayerNorm_forward(model, prefix + ".self_attn_norm", seqs);
 
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_add_inplace(ctx, seqs, residual);
 
     if (norm_order == TRANSFORMER_NORM_ORDER_POST)
         seqs =  LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs);
@@ -468,7 +504,7 @@ extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
     // TODO: if self.residual_scale is not None:
     // residual = self.residual_scale * residual
 
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_add_inplace(ctx, seqs, residual);
 
     if (norm_order == TRANSFORMER_NORM_ORDER_POST)
         seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs);
@@ -496,7 +532,7 @@ extern "C" ggml_tensor* WaveformToFbank_forward(
 
     std::vector<float_t> signal_frame{};
     std::int32_t num_frames = knf::NumFrames(/*num_samples=*/waveform->ne[0], frame_opts);
-    struct ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 80, num_frames);
+    ggml_tensor* output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 80, num_frames);
     knf::FbankComputer native_(opts);
     knf::FeatureWindowFunction window_fn_(native_.GetFrameOptions());
 
@@ -520,7 +556,7 @@ extern "C" ggml_tensor* WaveformToFbank_forward(
     output = ggml_norm(ctx, output, 1e-5);
     output = ggml_dup(ctx, ggml_transpose(ctx, output));
     if (output->ne[1] % 2 == 1) {
-        struct ggml_tensor * remove_last = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, output->ne[1]-1);
+        ggml_tensor* remove_last = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, output->ne[1]-1);
         for (int i = 0; i < output->ne[1]-1; ++i) {
             ((int32_t *) remove_last->data)[i] = i;
         }
@@ -541,9 +577,9 @@ extern "C" ggml_tensor* RelativePositionMHA_forward(
     ggml_tensor* residual = seqs;
     seqs = LayerNorm_forward(model, prefix + "_layer_norm", seqs);
     // self_attn: qkv
-    struct ggml_tensor * Qcur = Linear_forward(model, prefix + ".q_proj", seqs);
-    struct ggml_tensor * Kcur = Linear_forward(model, prefix + ".k_proj", seqs);
-    struct ggml_tensor * Vcur = Linear_forward(model, prefix + ".v_proj", seqs);
+    ggml_tensor* Qcur = Linear_forward(model, prefix + ".q_proj", seqs);
+    ggml_tensor* Kcur = Linear_forward(model, prefix + ".k_proj", seqs);
+    ggml_tensor* Vcur = Linear_forward(model, prefix + ".v_proj", seqs);
 
     // self_attn: rel_pos SDPA
     int32_t S = seqs->ne[1];
@@ -556,9 +592,7 @@ extern "C" ggml_tensor* RelativePositionMHA_forward(
 
     int num_indices = end_index - start_index;
 
-    struct ggml_tensor *rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices);
-    rows->data = malloc(ggml_nbytes(rows));
-
+    ggml_tensor* rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices);
     for (int i = 0; i < num_indices; i++) {
         ((int32_t *)rows->data)[i] = start_index + i;
     }
@@ -566,82 +600,77 @@ extern "C" ggml_tensor* RelativePositionMHA_forward(
     // self_attn: load pos_enc weights & compute_r
     // In fairseq2 pos_enc weights are calculated on the fly, since some more custom operators might be needed to enable this,
     // we store the results (fixed) in checkpoint as model.audio_enc_pos_enc_w and load directly.
-    struct ggml_tensor * r = ggml_get_rows(ctx, model.tensors["speech_encoder.pos_enc"], rows);
-    r = ggml_mul_mat(ctx, model.tensors[prefix + ".sdpa.r_proj.weight"], r);
+    ggml_tensor* r = ggml_get_rows(ctx, model.tensors["speech_encoder.pos_enc"], rows);
+    r = mul_mat(ctx, model.tensors[prefix + ".sdpa.r_proj.weight"], r);
     r = ggml_dup(ctx, ggml_permute(ctx,
                         ggml_cpy(ctx,
                             r,
                             ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S*2-1)),
                         0, 2, 1, 3));
 
-    struct ggml_tensor * u_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.u_bias"], K_h, 1, H);
-    struct ggml_tensor * v_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.v_bias"], K_h, 1, H);
+    ggml_tensor* u_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.u_bias"], K_h, 1, H);
+    ggml_tensor* v_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.v_bias"], K_h, 1, H);
 
     // self_attn: Permute QKV
 
-    struct ggml_tensor * Q =
-                ggml_dup(ctx, ggml_permute(ctx,
+    ggml_tensor* Q = ggml_cont(ctx, ggml_permute(ctx,
                         ggml_cpy(ctx,
                             Qcur,
                             ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
                         0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
-    struct ggml_tensor * K =
-                ggml_dup(ctx, ggml_permute(ctx,
+    ggml_tensor* K = ggml_cont(ctx, ggml_permute(ctx,
                         ggml_cpy(ctx,
                             Kcur,
                             ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
                         0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
-    struct ggml_tensor * V =
-                ggml_dup(ctx, ggml_permute(ctx,
+    ggml_tensor* V = ggml_cont(ctx, ggml_permute(ctx,
                         ggml_cpy(ctx,
                             Vcur,
                             ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
                         1, 2, 0, 3)); // (H * K_h, S) -> (K_h, H, S) -> (H, S, K_h)
 
 
-    struct ggml_tensor * q_with_u_bias = ggml_add(ctx, Q, u_bias); // (K_h, S, H)
-    struct ggml_tensor * q_with_v_bias = ggml_add(ctx, Q, v_bias); // (K_h, S, H)
+    ggml_tensor* q_with_u_bias = ggml_add_inplace(ctx, ggml_dup(ctx, Q), u_bias); // (K_h, S, H)
+    ggml_tensor* q_with_v_bias = ggml_add_inplace(ctx, Q, v_bias); // (K_h, S, H)
 
-    struct ggml_tensor * ac = ggml_mul_mat(ctx, K, q_with_u_bias);
-    struct ggml_tensor * bd = ggml_mul_mat(ctx, r, q_with_v_bias);
+    ggml_tensor* ac = mul_mat(ctx, K, q_with_u_bias);
+    ggml_tensor* bd = mul_mat(ctx, r, q_with_v_bias);
 
 
     // self_attn: shift_bd. Logic follows https://github.com/facebookresearch/fairseq2/blob/main/src/fairseq2/nn/transformer/relative_attention.py#L161
     bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // H, S, 2S-1
 
-    struct ggml_tensor * pad = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, H, S, 1);
-    pad->data = malloc(ggml_nbytes(pad));
-
+    ggml_tensor* pad = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, H, S, 1);
     pad = ggml_set_f32(pad, 0.0);
+
     bd = ggml_concat(ctx, pad, bd); // bd[i][j][0] == 0, (H, S, 2S)
     bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // (2S, S, H)
-    bd = ggml_dup(ctx, ggml_reshape_3d(ctx, bd, S, 2*S, H));  // (S, 2S, H)
-    bd = ggml_remove_head_row(ctx, bd); // A custom operator introduced to reduce 1st row (in the 2nd dim)
-
-    bd = ggml_reshape_3d(ctx, bd, 2*S-1, S, H);
-
-    bd = ggml_get_first_cols_by_rows(ctx, bd); // A custom operator introduced to get first #rows cols.
-
+    bd = ggml_reshape_3d(ctx, bd, S, 2 * S, H);  // (S, 2S, H)
+    // discard the first set of positive positions
+    bd = ggml_dup(ctx, ggml_slice(ctx, bd, 1, 1, 2 * S));
+    // shifts each row by an extra step
+    bd = ggml_reshape_3d(ctx, bd, 2 * S - 1, S, H);
+    // Discard positions used for shift.
+    bd = ggml_slice(ctx, bd, 0, 0, S);
 
     // self_attn: compute attn / weights
-    struct ggml_tensor * attn_weights = ggml_add(ctx, ac, bd);
-    struct ggml_tensor * attn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
-    attn_scale->data = malloc(ggml_nbytes(attn_scale));
+    ggml_tensor* attn_weights = ggml_add_inplace(ctx, ac, bd);
+    ggml_tensor* attn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
     ggml_set_f32(attn_scale, 1.0 / pow(K_h, 0.5));
-    attn_weights = ggml_mul(ctx, ggml_repeat(ctx, attn_scale, attn_weights), attn_weights);
+    attn_weights = ggml_mul_inplace(ctx, attn_weights, ggml_repeat(ctx, attn_scale, attn_weights));
     attn_weights = ggml_soft_max(ctx, attn_weights);
 
-    struct ggml_tensor * attn = ggml_mul_mat(ctx, V, attn_weights); // K_h, S, H
+    ggml_tensor* attn = mul_mat(ctx, V, attn_weights); // K_h, S, H
     attn = ggml_dup(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));
-    struct ggml_tensor * attn_2d = ggml_reshape_2d(ctx, attn, K_h * H, S);
-
-    struct ggml_tensor * attn_out = ggml_mul_mat(ctx, model.tensors[prefix + ".output_proj.weight"], attn_2d);
-    attn_out = ggml_add(ctx,
-            ggml_repeat(ctx,
-                model.tensors[prefix + ".output_proj.bias"],
-                attn_out),
-            attn_out);
-    attn_out = ggml_add(ctx, residual, attn_out);
+    ggml_tensor* attn_2d = ggml_reshape_2d(ctx, attn, K_h * H, S);
+
+    ggml_tensor* attn_out = mul_mat(ctx, model.tensors[prefix + ".output_proj.weight"], attn_2d);
+    attn_out = ggml_add_inplace(
+        ctx,
+        attn_out,
+        ggml_repeat(ctx, model.tensors[prefix + ".output_proj.bias"], attn_out)
+    );
+    attn_out = ggml_add_inplace(ctx, attn_out, residual);
     return attn_out;
 }
 
@@ -654,7 +683,7 @@ extern "C" ggml_tensor* ConvModule_forward(
         ggml_tensor* residual = seqs;
         seqs = LayerNorm_forward(model, prefix + "_layer_norm", seqs);
         // conv: Use matmul for pointwise conv 1 - kernel_size=1, no padding case
-        seqs = ggml_mul_mat(ctx, model.tensors[prefix + ".pointwise_conv1.weight"], seqs);
+        seqs = mul_mat(ctx, model.tensors[prefix + ".pointwise_conv1.weight"], seqs);
 
         // conv: GLU
         seqs = ggml_glu(ctx, seqs);
@@ -667,14 +696,14 @@ extern "C" ggml_tensor* ConvModule_forward(
         seqs = ggml_batch_norm(ctx, seqs, model.tensors[prefix + ".batch_norm.weight"], model.tensors[prefix + ".batch_norm.bias"], model.tensors[prefix + ".batch_norm.running_mean"], model.tensors[prefix + ".batch_norm.running_var"], 1e-5);
 
         // conv: SiLU actvation
-        seqs = ggml_silu(ctx, seqs);
+        seqs = ggml_silu_inplace(ctx, seqs);
         seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
 
         // conv: Use matmul for pointwise conv 2 - kernel_size=1, no padding case
-        seqs = ggml_mul_mat(ctx, model.tensors[prefix + ".pointwise_conv2.weight"], seqs);
+        seqs = mul_mat(ctx, model.tensors[prefix + ".pointwise_conv2.weight"], seqs);
 
         // conv: + residual
-        seqs = ggml_add(ctx, seqs, residual);
+        seqs = ggml_add_inplace(ctx, seqs, residual);
         return seqs;
 }
 
@@ -685,21 +714,20 @@ extern "C" ggml_tensor* StandardConformerEncoderLayer_forward(
     ggml_tensor* padding_mask
 ) {
     ggml_context* ctx = model.ctx;
-    struct ggml_tensor * ffn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
-    ffn_scale->data = malloc(ggml_nbytes(ffn_scale));
+    ggml_tensor* ffn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
     ggml_set_f32(ffn_scale, 0.5f);
-    struct ggml_tensor * residual = seqs;
+    ggml_tensor* residual = seqs;
     seqs = LayerNorm_forward(model, prefix + ".ffn1_layer_norm", seqs);
     seqs = SiluFeedForwardNetwork_forward(model, prefix + ".ffn1", seqs);
-    seqs = ggml_mul(ctx, ggml_repeat(ctx, ffn_scale, seqs), seqs);
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_mul_inplace(ctx, seqs, ggml_repeat(ctx, ffn_scale, seqs));
+    seqs = ggml_add_inplace(ctx, seqs, residual);
     seqs = RelativePositionMHA_forward(model, prefix + ".self_attn", seqs);
     seqs = ConvModule_forward(model, prefix + ".conv", seqs);
     residual = seqs;
     seqs = LayerNorm_forward(model, prefix + ".ffn2_layer_norm", seqs);
     seqs = SiluFeedForwardNetwork_forward(model, prefix + ".ffn2", seqs);
-    seqs = ggml_mul(ctx, ggml_repeat(ctx, ffn_scale, seqs), seqs);
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_mul_inplace(ctx, seqs, ggml_repeat(ctx, ffn_scale, seqs));
+    seqs = ggml_add_inplace(ctx, seqs, residual);
     seqs = LayerNorm_forward(model, prefix + ".layer_norm", seqs);
     return seqs;
 }
@@ -709,7 +737,7 @@ extern "C" ggml_tensor* StandardConformerEncoder_forward(
     const std::string& prefix,
     ggml_tensor* seqs,
     ggml_tensor* padding_mask
-) { // TODO: Implement this!
+) {
     ggml_context* ctx = model.ctx;
     seqs = WaveformToFbank_forward(model, prefix, seqs);
     seqs = LayerNorm_forward(model, prefix + "_frontend.post_extract_layer_norm", seqs);
@@ -732,11 +760,10 @@ extern "C" ggml_tensor* StandardConformerEncoder_forward(
     seqs = Linear_forward(model, prefix + ".proj1", seqs);
     seqs = ggml_relu_inplace(ctx, seqs);
     seqs = Linear_forward(model, prefix + ".proj2", seqs);
-    struct ggml_tensor * ffn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
-    ffn_scale->data = malloc(ggml_nbytes(ffn_scale));
+    ggml_tensor* ffn_scale = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1);
     ggml_set_f32(ffn_scale, 0.5f);
     seqs = ggml_mul(ctx, ggml_repeat(ctx, ffn_scale, seqs), seqs);
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_add_inplace(ctx, seqs, residual);
     layer_idx = 0;
     layer_name = prefix + ".adaptor_layers." + std::to_string(layer_idx);
     while (has_layer(model, layer_name)) {
@@ -759,19 +786,19 @@ extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
     ggml_tensor* padding_mask
 ) {
     ggml_context* ctx = model.ctx;
-    struct ggml_tensor * residual = seqs;
+    ggml_tensor* residual = seqs;
     residual = LayerNorm_forward(model, prefix + ".residual_layer_norm", residual);
     residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
     residual = ggml_conv_1d_generic(ctx, model.tensors[prefix + ".residual_conv.weight"], residual, 8, 4, 1);
     residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
-    residual = ggml_add(ctx, ggml_repeat(ctx, model.tensors[prefix + ".residual_conv.bias"], residual), residual);
+    residual = ggml_add_inplace(ctx, ggml_repeat(ctx, model.tensors[prefix + ".residual_conv.bias"], residual), residual);
     residual = ggml_glu(ctx, residual);
 
     seqs = LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs);
     seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
     seqs = ggml_conv_1d_generic(ctx, model.tensors[prefix + ".self_attn_conv.weight"], seqs, 8, 4, 1);
     seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
-    seqs = ggml_add(ctx, ggml_repeat(ctx, model.tensors[prefix + ".self_attn_conv.bias"], seqs), seqs);
+    seqs = ggml_add_inplace(ctx, seqs, ggml_repeat(ctx, model.tensors[prefix + ".self_attn_conv.bias"], seqs));
     seqs = ggml_glu(ctx, seqs);
 
     seqs = MultiheadAttention_forward(
@@ -782,18 +809,18 @@ extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
         seqs,
         /*attention masks=*/nullptr
     );
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_add_inplace(ctx, seqs, residual);
     residual = seqs;
     seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs);
     seqs = StandardFeedForwardNetwork_forward(model, prefix + ".ffn", seqs);
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_add_inplace(ctx, seqs, residual);
     return seqs;
 }
 
 
 /// ggml_slice(X, -1, start, end) is equivalent to X[start:end]
 /// ggml_slice(X, 0, start, end) is equivalent to X[..., start:end]
-struct ggml_tensor * ggml_slice(
+ggml_tensor* ggml_slice(
     struct ggml_context * ctx,
     struct ggml_tensor  * a,
     int axis,
@@ -804,9 +831,9 @@ struct ggml_tensor * ggml_slice(
     std::copy(a->ne, a->ne + 4, ne);
     if (axis < 0) axis = a->n_dims + axis;
     if (start < 0) start = ne[axis] + start;
-    if (end < 0) end = ne[axis] + end;
+    if (end <= 0) end = ne[axis] + end;
     GGML_ASSERT(0 <= start);
-    GGML_ASSERT(start <= end);
+    GGML_ASSERT(start < end);
     GGML_ASSERT(end <= ne[axis]);
 
 
@@ -820,7 +847,7 @@ struct ggml_tensor * ggml_slice(
     return result;
 }
 
-struct ggml_tensor * ggml_select(
+ggml_tensor* ggml_select(
     struct ggml_context * ctx,
     struct ggml_tensor  * a,
     int axis,
@@ -846,6 +873,7 @@ struct ggml_tensor * ggml_select(
 }
 
 
+// Inplace computation of PositionalEmbedding
 extern "C" ggml_tensor* PositionalEmbedding_forward(
     fairseq2_model& model,
     const std::string& prefix,
@@ -955,7 +983,7 @@ extern "C" ggml_tensor* StandardTransformerDecoderLayer_forward(
     if (has_layer(model, prefix + ".self_attn_norm"))
         seqs = LayerNorm_forward(model, prefix + ".self_attn_norm", seqs);
 
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_add_inplace(ctx, seqs, residual);
 
     if (norm_order == TRANSFORMER_NORM_ORDER_POST)
         seqs =  LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs);
@@ -985,7 +1013,7 @@ extern "C" ggml_tensor* StandardTransformerDecoderLayer_forward(
         /*attention masks=*/encoder_padding_mask
     );
 
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_add_inplace(ctx, seqs, residual);
 
     if (norm_order == TRANSFORMER_NORM_ORDER_POST)
         seqs =  LayerNorm_forward(model, prefix + ".encoder_decoder_attn_layer_norm", seqs);
@@ -1002,7 +1030,7 @@ extern "C" ggml_tensor* StandardTransformerDecoderLayer_forward(
     // if self.residual_scale is not None:
     // residual = self.residual_scale * residual
 
-    seqs = ggml_add(ctx, seqs, residual);
+    seqs = ggml_add_inplace(ctx, seqs, residual);
 
     if (norm_order == TRANSFORMER_NORM_ORDER_POST)
         seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs);
@@ -1119,6 +1147,7 @@ extern "C" void _bootstrap_seqs_and_scores(
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask
 ) {
+    ZoneScoped;
     int prefix_seq_len = job.prefix_seq->ne[0];
     int max_seq_len = scores->ne[0];
     int beam_size = scores->ne[1];
@@ -1181,6 +1210,7 @@ int topk(
     std::int64_t k,
     ggml_tensor* candidate_indices
 ) {
+    ZoneNamed(topk, true);
     // Take the best 2 x `beam_size` predictions. We'll choose the first
     // `beam_size` of these which don't predict EOS to continue with.
     // (N, 2 x B)
@@ -1196,6 +1226,46 @@ int topk(
     return K;
 }
 
+void _tweak_lprobs(const SequenceGeneratorJob& job, ggml_tensor* lprobs, int step_nr, int max_seq_len, std::size_t vocab_size) {
+    ZoneNamed(tweak_lprobs, true);
+    std::size_t beam_size = job.opts.beam_size;
+    std::size_t eos_idx = job.eos_idx;
+
+    // Do not allow EOS before reaching the minimum sequence length.
+    if (step_nr < job.opts.min_seq_len) {
+        // lprobs[:, :, self.eos_idx] = -INFINITY;
+        for (size_t i = 0; i < beam_size; ++i)
+            ggml_set_f32_1d(lprobs, vocab_size * i + eos_idx, -INFINITY);
+    }
+
+    // If we have reached the maximum length, force the last step to be EOS.
+    if (step_nr == max_seq_len - 2) {
+        // lprobs[:, :, : self.eos_idx]       = -torch.inf
+        // lprobs[:, :,   self.eos_idx + 1 :] = -torch.inf
+        for (size_t b = 0; b < beam_size; ++b) {
+            size_t t = 0;
+            for (t = 0; t < eos_idx; ++t)
+                ggml_set_f32_1d(lprobs, vocab_size * b + t, -INFINITY);
+            for (t = eos_idx + 1; t < vocab_size; ++t)
+                ggml_set_f32_1d(lprobs, vocab_size * b + t, -INFINITY);
+        }
+    }
+
+    // Never allow PAD.
+    std::size_t pad_idx = job.pad_idx;
+    for (size_t i = 0; i < beam_size; ++i)
+        ggml_set_f32_1d(lprobs, vocab_size * i + pad_idx, -INFINITY);
+
+    // Apply UNK penalty.
+    if (job.unk_idx >= 0 && job.opts.unk_penalty != 0) {
+        // lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty
+        auto lprobs_raw = ggml_get_data_f32(lprobs);
+        for (size_t i = 0; i < beam_size; ++i)
+            lprobs_raw[vocab_size * i + job.unk_idx] -= job.opts.unk_penalty;
+    }
+}
+
+
 
 /// Copies the sequence and scores of a given candidate beam.
 void _finalize_hypothesis(
@@ -1209,6 +1279,7 @@ void _finalize_hypothesis(
     ggml_tensor* scores, // (beam_size, seq_len)
     Hypothesis* hypothesis
 ) {
+    ZoneNamed(_finalize_hypothesis, true);
     ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
     hypothesis->seq = seq;
     ggml_tensor* step_scores = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, step_nr + 2);
@@ -1241,6 +1312,15 @@ void _finalize_hypothesis(
     (Type*)(ggml_new_tensor_1d(ctx, GGML_TYPE_I8, sizeof(Type) * n)->data);
 
 
+ggml_context* ctx_from_buffer(std::vector<uint8_t>& buffer) {
+    return ggml_init({
+        /*.mem_size   =*/ static_cast<int64_t>(buffer.capacity()),
+        /*.mem_buffer =*/ buffer.data(),
+        /*.no_alloc   =*/ false,
+    });
+}
+
+
 /// Generates a translation for a single sequence
 // TODO: clean ups
 // * replace manual tensor tweaking with ggml_set_*d (a ggml_set_slice could be useful)
@@ -1251,9 +1331,13 @@ extern "C" Hypothesis* generate_sequence(
     ggml_tensor* encoder_padding_mask,
     ggml_context* result_ctx
 ) {
-    ggml_context* ctx = model.ctx;
-    size_t eos_idx = job.eos_idx;
-    auto pad_idx = job.pad_idx;
+    ZoneScoped;
+    std::vector<uint8_t> local_bufs[3] = {
+        std::vector<uint8_t>(256 * 1024 * 1024),  // step_ctx
+        std::vector<uint8_t>(256 * 1024 * 1024),  // next_step_ctx
+        std::vector<uint8_t>(256 * 1024 * 1024)  // search_ctx
+    };
+    ggml_context* search_ctx = ctx_from_buffer(local_bufs[2]);
 
     ggml_tensor* embed = model.tensors["text_decoder_frontend.embed.weight"];
     size_t vocab_size = embed->ne[1];
@@ -1261,10 +1345,12 @@ extern "C" Hypothesis* generate_sequence(
     int source_seq_len = encoder_output->ne[1];
     int max_seq_len = _determine_max_seq_len(job, source_seq_len);
 
+    ggml_context* original_ctx = model.ctx;
+    model.ctx = search_ctx;
     fairseq2_kv_cache_alloc(model, beam_size, max_seq_len);
 
     // (S_enc, M) -> (B, S_enc, M)
-    _fan_out_encoder_output(ctx, &encoder_output, &encoder_padding_mask, beam_size);
+    _fan_out_encoder_output(search_ctx, &encoder_output, &encoder_padding_mask, beam_size);
 
     // Allocate results in the context provided by the caller.
     Hypothesis* finished_searches_begin = GGML_CTX_ALLOC(result_ctx, Hypothesis, beam_size);
@@ -1273,10 +1359,10 @@ extern "C" Hypothesis* generate_sequence(
     Hypothesis* finished_searches_end = finished_searches + beam_size;
 
     // Initialize buffers. (B, S)
-    ggml_tensor* seqs = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, max_seq_len, beam_size);
+    ggml_tensor* seqs = ggml_new_tensor_2d(search_ctx, GGML_TYPE_I32, max_seq_len, beam_size);
     ggml_set_i32(seqs, 0);
     ggml_set_name(seqs, "seqs_0");
-    ggml_tensor* scores = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, max_seq_len, beam_size);
+    ggml_tensor* scores = ggml_new_tensor_2d(search_ctx, GGML_TYPE_F32, max_seq_len, beam_size);
     ggml_set_name(scores, "scores_0");
     ggml_set_f32(scores, 0.0);
 
@@ -1288,18 +1374,22 @@ extern "C" Hypothesis* generate_sequence(
 
     // Holds the indices of beams (a beam can occur more than once) that we
     // should continue with in the next step.
-    ggml_tensor* beam_indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, beam_size);
-    ggml_tensor* next_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, beam_size);
-    ggml_tensor* next_scores = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, beam_size);
+    ggml_tensor* beam_indices = ggml_new_tensor_1d(search_ctx, GGML_TYPE_I32, beam_size);
+    ggml_tensor* next_tokens = ggml_new_tensor_1d(search_ctx, GGML_TYPE_I32, beam_size);
+    ggml_tensor* next_scores = ggml_new_tensor_1d(search_ctx, GGML_TYPE_F32, beam_size);
 
     // Array with integers up to 'vocab_size * beam_size' to represent next beams to explore
-    ggml_tensor* candidate_indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, vocab_size * beam_size);
+    ggml_tensor* candidate_indices = ggml_new_tensor_1d(search_ctx, GGML_TYPE_I32, vocab_size * beam_size);
     for (std::size_t i = 0; i < vocab_size * beam_size; ++i)
         ((int32_t *)(candidate_indices->data))[i] = i;
 
-    // TODO: memory management, there should be a per-step ggml_context for intermediary results
+    printf_mem_usage(search_ctx, "search_ctx");
+
+    ggml_context* step_ctx = ctx_from_buffer(local_bufs[0]);
+    ggml_context* next_step_ctx = nullptr;
     for (int step_nr = start_step; step_nr < max_seq_len - 1; ++step_nr) {
-        ggml_tensor* prev_token = ggml_slice(ctx, seqs, 0, step_nr, step_nr + 1);
+        model.ctx = step_ctx;
+        ggml_tensor* prev_token = ggml_slice(step_ctx, seqs, 0, step_nr, step_nr + 1);
         ggml_tensor* decoder_input = TransformerEmbeddingFrontend_forward(model, "text_decoder_frontend", prev_token);
         ggml_tensor* decoder_output = StandardTransformerDecoder_forward(
             model,
@@ -1311,67 +1401,37 @@ extern "C" Hypothesis* generate_sequence(
         ); // (B, 1, D)
 
         // Just look at the last token.
-        decoder_output = ggml_flatten_1d(ctx, decoder_output, 0);  // (B, model_dim)
+        decoder_output = ggml_flatten_1d(step_ctx, decoder_output, 0);  // (B, model_dim)
         ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output);  // (B, vocab_size)
-        ggml_tensor* lprobs = ggml_log_softmax(ctx, logits);
+        ggml_tensor* lprobs = ggml_log_softmax(step_ctx, logits);
 
         // Compute lprobs here so we can modify it in place in the lprob tweaking phase
         // TODO: use ggml properly compute the tweaks
         ggml_cgraph gf = ggml_build_forward(lprobs);
-        printf("beam search step %d. Graph.n_nodes: %d\n", step_nr, gf.n_nodes);
-        ggml_graph_compute_with_ctx(ctx, &gf, 1);
+        // printf("beam search step %d. Graph.n_nodes: %d\n", step_nr, gf.n_nodes);
+        ggml_graph_compute_with_ctx(step_ctx, &gf, 1);
         ggml_detach(lprobs);
 
-        // // Do not allow EOS before reaching the minimum sequence length.
-        if (step_nr < job.opts.min_seq_len) {
-            // lprobs[:, :, self.eos_idx] = -INFINITY;
-            for (size_t i = 0; i < beam_size; ++i)
-                ggml_set_f32_1d(lprobs, vocab_size * i + eos_idx, -INFINITY);
-        }
-
-        // If we have reached the maximum length, force the last step to be EOS.
-        if (step_nr == max_seq_len - 2) {
-            // lprobs[:, :, : self.eos_idx]       = -torch.inf
-            // lprobs[:, :,   self.eos_idx + 1 :] = -torch.inf
-            for (size_t b = 0; b < beam_size; ++b) {
-                size_t t = 0;
-                for (t = 0; t < eos_idx; ++t)
-                    ggml_set_f32_1d(lprobs, vocab_size * b + t, -INFINITY);
-                for (t = eos_idx + 1; t < vocab_size; ++t)
-                    ggml_set_f32_1d(lprobs, vocab_size * b + t, -INFINITY);
-            }
-        }
-
-        // Never allow PAD.
-        for (size_t i = 0; i < beam_size; ++i)
-            ggml_set_f32_1d(lprobs, vocab_size * i + pad_idx, -INFINITY);
-
-        // Apply UNK penalty.
-        if (job.unk_idx >= 0 && job.opts.unk_penalty != 0) {
-            // lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty
-            auto lprobs_raw = ggml_get_data_f32(lprobs);
-            for (size_t i = 0; i < beam_size; ++i)
-                lprobs_raw[vocab_size * i + job.unk_idx] -= job.opts.unk_penalty;
-        }
+        _tweak_lprobs(job, lprobs, step_nr, max_seq_len, vocab_size);
 
-        ggml_tensor* last_scores = ggml_slice(ctx, scores, 0, step_nr, step_nr+1);
+        ggml_tensor* last_scores = ggml_slice(step_ctx, scores, 0, step_nr, step_nr+1);
         if (step_nr == start_step) {
             // At the initial step, all hypotheses are equally likely, so we use
             // only the first beam.
-            lprobs = ggml_slice(ctx, lprobs, 1, 0, 1);
-            lprobs = ggml_cont(ctx, lprobs);
+            lprobs = ggml_slice(step_ctx, lprobs, 1, 0, 1);
+            lprobs = ggml_cont(step_ctx, lprobs);
             // The first step always indicates the beginning of the sequence and has no score.
             if (step_nr > 0) {
-                last_scores = ggml_slice(ctx, last_scores, 1, 0, 1);
-                lprobs = ggml_add_inplace(ctx, lprobs, ggml_repeat(ctx, last_scores, lprobs));
+                last_scores = ggml_slice(step_ctx, last_scores, 1, 0, 1);
+                lprobs = ggml_add_inplace(step_ctx, lprobs, ggml_repeat(step_ctx, last_scores, lprobs));
             }
         } else {
             // Make probabilities contain cumulative scores for each hypothesis.
-            lprobs = ggml_add(ctx, lprobs, ggml_repeat(ctx, last_scores, lprobs));
+            lprobs = ggml_add_inplace(step_ctx, lprobs, ggml_repeat(step_ctx, last_scores, lprobs));
         }
 
         gf = ggml_build_forward(lprobs);
-        ggml_graph_compute_with_ctx(ctx, &gf, 1);
+        ggml_graph_compute_with_ctx(step_ctx, &gf, 1);
 
         // Determine (beam, token) candidates for the next step.
         // (N, 2 x B)
@@ -1381,6 +1441,7 @@ extern "C" Hypothesis* generate_sequence(
 
         std::size_t ongoing_beams = 0;
         for (std::int32_t i = 0; i < K; ++i) {
+            ZoneNamed(beam_search_step, true);
             int c = ggml_get_f32_1d(candidate_indices, i);
             std::int32_t beam = c / vocab_size;
             std::int32_t token = c % vocab_size;
@@ -1411,16 +1472,19 @@ extern "C" Hypothesis* generate_sequence(
             // (B, S), (B) -> (B, S)
             // ggml_get_rows and ggml_set only work with floats ...
             new_seqs->type = GGML_TYPE_F32;
-            new_seqs = ggml_get_rows(ctx, seqs, beam_indices);
-            new_scores = ggml_get_rows(ctx, scores, beam_indices);
+            new_seqs = ggml_get_rows(search_ctx, seqs, beam_indices);
+            new_scores = ggml_get_rows(search_ctx, scores, beam_indices);
             ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
             ggml_build_forward_expand(&gf_reorder, new_scores);
-            reorder_kv_cache(model, &gf_reorder, beam_indices);
+            next_step_ctx = ctx_from_buffer(local_bufs[(step_nr + 1) % 2]);
+            reorder_kv_cache(model, next_step_ctx, &gf_reorder, beam_indices);
 
-            ggml_graph_compute_with_ctx(ctx, &gf_reorder, 1);
+            ggml_graph_compute_with_ctx(next_step_ctx, &gf_reorder, 1);
             ggml_detach(new_seqs);
             ggml_detach(new_scores);
             new_seqs->type = GGML_TYPE_I32;
+            printf_mem_usage(search_ctx, "search_ctx");
+            SWAP(step_ctx, next_step_ctx);
         }
 
         // new_seqs[:, step_nr + 1] = next_tokens
@@ -1433,6 +1497,7 @@ extern "C" Hypothesis* generate_sequence(
         // TODO the old seqs and score buffers could be reused for next step
         seqs = new_seqs;
         scores = new_scores;
+        printf_mem_usage(step_ctx, "step_ctx");
     }
 
 end_of_beam_search:
@@ -1444,6 +1509,7 @@ end_of_beam_search:
     );
 
     fairseq2_kv_cache_reset(model);
+    model.ctx = original_ctx;
     return finished_searches_begin;
 }
 
@@ -1458,3 +1524,188 @@ extern "C" Hypothesis* _testing_return_hypothesis_ptr(ggml_context* ctx) {
 
     return result;
 }
+
+// SPM tokenizer
+// original implementation:
+// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
+
+
+
+struct llm_symbol {
+    using index = int;
+    index prev;
+    index next;
+    const char * text;
+    size_t n;
+    llama_vocab::id id;
+};
+
+static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
+
+static size_t utf8_len(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
+struct llm_bigram_spm {
+    struct comparator {
+        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
+            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
+        }
+    };
+    using queue_storage = std::vector<llm_bigram_spm>;
+    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
+    llm_symbol::index left;
+    llm_symbol::index right;
+    float score;
+    size_t size;
+    llama_vocab::id id;
+};
+
+struct llm_tokenizer_spm {
+    llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
+
+    void tokenize(const std::string& input_text, ggml_tensor& output) {
+        llama_vocab::id unk_idx = vocab.token_to_id.at("<unk>");
+
+        // split string into utf8 chars
+        int index = 0;
+        size_t offs = 0;
+        // This is kind of annoying, but needed because with SPM,
+        // characters following a space have a special meaning.
+        // And the algorithm relies on substrings to do the lookups.
+        std::string text = input_text;
+        bool need_extra_space = text.size() > 0 && text[0] != ' ';
+        if (need_extra_space) text = " " + text;
+
+        while (offs < text.size()) {
+            size_t len = utf8_len(text[offs]);
+            size_t n = std::min(len, text.size() - offs);
+
+            auto token = vocab.token_to_id.find(std::string(text, offs, n));
+            llama_vocab::id id = token == vocab.token_to_id.end() ? unk_idx : token->second;
+            llm_symbol sym = {
+                /*prev*/ index - 1,
+                /*next*/ offs + n == text.size() ? -1 : index + 1,
+                /*text*/ text.c_str() + offs,
+                /*n*/ n,
+                /*id*/ id
+            };
+            offs += n;
+            index++;
+            symbols.emplace_back(sym);
+        }
+
+        // seed the work queue with all possible 2-character tokens.
+        for (size_t i = 1; i < symbols.size(); ++i) {
+            try_add_bigram(i - 1, i);
+        }
+
+        // keep substituting the highest frequency pairs for as long as we can.
+        while (!work_queue.empty()) {
+            auto bigram = work_queue.top();
+            work_queue.pop();
+
+            auto & left_sym = symbols[bigram.left];
+            auto & right_sym = symbols[bigram.right];
+            const std::string text = std::string(left_sym.text, left_sym.n + right_sym.n);
+
+            // if one of the symbols already got merged, skip it.
+            if (
+                left_sym.n == 0
+                || right_sym.n == 0
+                || left_sym.n + right_sym.n != bigram.size
+            ) continue;
+
+            // merge the right sym into the left one
+            left_sym.n += right_sym.n;
+            left_sym.id = bigram.id;
+            right_sym.n = 0;
+
+            // remove the right sym from the chain
+            left_sym.next = right_sym.next;
+            if (right_sym.next >= 0) {
+                symbols[right_sym.next].prev = bigram.left;
+            }
+
+            // find more substitutions
+            try_add_bigram(left_sym.prev, bigram.left);
+            try_add_bigram(bigram.left, left_sym.next);
+        }
+
+        llama_vocab::id* out = (llama_vocab::id*)output.data;
+        int out_step = sizeof(llama_vocab::id) / output.nb[0];
+        int num_tokens = 0;
+        for (int i = 0; i > -1; i = symbols[i].next) {
+            llm_symbol& symbol = symbols[i];
+            *(out + num_tokens * out_step) = symbol.id;
+            num_tokens += 1;
+        }
+        *(out + num_tokens * out_step) = vocab.token_to_id.at("</s>");
+        num_tokens += 1;
+        output.ne[0] = num_tokens;
+    }
+
+private:
+
+    void try_add_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+
+        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
+        auto token = vocab.token_to_id.find(text);
+
+        if (token == vocab.token_to_id.end()) {
+            return;
+        }
+
+        llama_vocab::id id = token->second;
+        if (static_cast<size_t>(id) >= vocab.id_to_token.size()) {
+            return;
+        }
+
+        const auto& tok_data = vocab.id_to_token[id];
+        llm_bigram_spm bigram = {
+            /*left */ left,
+            /*right*/ right,
+            /*score*/ tok_data.score,
+            /*size */ text.size(),
+            /*id */ id
+        };
+        work_queue.push(bigram);
+    }
+
+    const llama_vocab& vocab;
+    std::vector<llm_symbol> symbols;
+    llm_bigram_spm::queue work_queue;
+};
+
+
+extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor& out) {
+    llm_tokenizer_spm spm = {model->vocab};
+    spm.tokenize(std::string(text), out);
+}
+
+extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out) {
+    int eos_idx = model->vocab.token_to_id["</s>"];
+    int sent_len = tokens->ne[0];
+    std::size_t written = 0;
+    for (int i = 0; i < sent_len; ++i) {
+        int id = ggml_get_i32_1d(tokens, i);
+        // Skip the EOS token, but only when it appears at the very end.
+        if (i == sent_len - 1 && eos_idx == id) break;
+
+        std::string token = model->vocab.id_to_token.at(id).text;
+        // Skip the first space outputted.
+        auto begin = token.begin();
+        if (i == 0 && token.size() > 0 && token[0] == ' ') begin += 1;
+        std::copy(begin, token.end(), out);
+        std::size_t n = token.end() - begin;
+        written += n;
+        out += n;
+    }
+    *out = '\0';  // null-terminate the detokenized string
+    return written;
+}
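
The beam-search memory note in the commit message ("lower memory usage ... to <0.5GB") maps to ctx_from_buffer and the step_ctx / next_step_ctx swap above: each decode step writes its intermediates into a fixed 256 MB buffer instead of growing the model context. A stripped-down sketch of the pattern, assuming only the upstream ggml API plus ctx_from_buffer from this file (the surrounding function is hypothetical):

#include <cstdint>
#include <cstdio>
#include <vector>
#include "ggml.h"

ggml_context* ctx_from_buffer(std::vector<uint8_t>& buffer);  // declared in fairseq2.h

// Two fixed buffers back the per-step contexts; whatever the next step still
// needs (the reordered kv cache in the real code) is computed into the other
// buffer before this one is recycled, so memory no longer grows with sequence length.
void run_decode_steps(int n_steps) {
    std::vector<uint8_t> bufs[2] = {
        std::vector<uint8_t>(256 * 1024 * 1024),
        std::vector<uint8_t>(256 * 1024 * 1024),
    };
    for (int step = 0; step < n_steps; ++step) {
        ggml_context* step_ctx = ctx_from_buffer(bufs[step % 2]);
        ggml_tensor* x = ggml_new_tensor_1d(step_ctx, GGML_TYPE_F32, 1024);
        ggml_set_f32(x, 1.0f);
        ggml_cgraph gf = ggml_build_forward(ggml_sqr(step_ctx, x));  // stand-in for the decoder step
        ggml_graph_compute_with_ctx(step_ctx, &gf, 1);
        printf("step %d: %.2f MB used\n", step, ggml_used_mem(step_ctx) / (1024.0 * 1024.0));
        ggml_free(step_ctx);  // frees the context slot; the underlying buffer is reused
    }
}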

+ 69 - 0
ggml/examples/unity/fairseq2.h

@@ -6,6 +6,68 @@
 #include "ggml.h"
 #include "kaldi-native-fbank/csrc/feature-fbank.h"
 
+typedef int32_t llama_token;
+
+extern "C" enum llama_token_type {
+    LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+    LLAMA_TOKEN_TYPE_NORMAL       = 1,
+    LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+    LLAMA_TOKEN_TYPE_CONTROL      = 3,
+    LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+    LLAMA_TOKEN_TYPE_UNUSED       = 5,
+    LLAMA_TOKEN_TYPE_BYTE         = 6,
+};
+
+
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+    using ttype = llama_token_type;
+
+    struct token_data {
+        token text;
+        float score;
+        ttype type;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data>       id_to_token;
+
+    std::unordered_map<token, id> special_tokens_cache;
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+
+    // default LLaMA special tokens
+    id special_bos_id = 1;
+    id special_eos_id = 2;
+    id special_unk_id = 0;
+    id special_sep_id = -1;
+    id special_pad_id = -1;
+
+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
+    id linefeed_id       = 13;
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id    = 32010;
+
+    int find_bpe_rank(std::string token_left, std::string token_right) const {
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
+
+        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
+        if (it == bpe_ranks.end()) {
+            return -1;
+        }
+
+        return it->second;
+    }
+};
+
+
 struct KeyValueTensor {
     ggml_tensor* full_k;
     ggml_tensor* full_v;
@@ -27,6 +89,8 @@ struct fairseq2_model {
     // Normally those can be inferred from hparams, but it avoids doing this logic in GGML
     std::unordered_map<std::string, std::int64_t> layer_config;
 
+    llama_vocab vocab;
+
     // KV cache for attention layers
     mutable std::unordered_map<std::string, KeyValueTensor> kv_cache;
 
@@ -42,6 +106,8 @@ extern "C" fairseq2_model* fairseq2_model_alloc();
 // free the models and all its owned tensors
 extern "C" void fairseq2_model_free(fairseq2_model* model);
 extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx);
+extern "C" void fairseq2_kv_cache_reset(const fairseq2_model& model);
+ggml_context* ctx_from_buffer(std::vector<uint8_t>& buffer);
 
 extern "C" std::string* std_string_alloc(char* c_str);
 extern "C" void std_string_free(std::string* str);
@@ -233,3 +299,6 @@ extern "C" Hypothesis* generate_sequence(
     ggml_tensor* encoder_padding_mask,
     ggml_context* result_ctx
 );
+
+extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor& out);
+extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out);

+ 41 - 2
ggml/examples/unity/model_loader.cpp

@@ -44,7 +44,7 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
     fin.read((char*) &ctx_size, sizeof(ctx_size));
 
     struct ggml_init_params params = {
-        /*.mem_size   =*/ static_cast<std::size_t>(ctx_size),
+        /*.mem_size   =*/ ctx_size,
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ false,
     };
@@ -71,7 +71,7 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
     }
 
     double mb = 1024.0 * 1024.0;
-    printf("%s: model size  = %8.2f MB, memory used = %8.2f MB, memory reserved = %8.2f \n",
+    printf("%s: model size: %8.2f MB, memory used: %8.2f MB, memory reserved: %8.2f MB\n",
         __func__,
         model_size / mb,
         ggml_used_mem(model.tensors_ctx) / mb,
@@ -120,6 +120,44 @@ void model_loader::load_hparams(std::unordered_map<std::string, std::int64_t>& h
     }
 }
 
+void model_loader::load_vocab(llama_vocab& vocab, std::ifstream &fin)
+{
+    // vocab.special_bos_id = 1;
+    // vocab.special_eos_id = 2;
+    // vocab.special_unk_id = 0;
+    // vocab.special_sep_id = -1;
+    // vocab.special_pad_id = -1;
+
+    std::int64_t vocab_size = 0;
+    fin.read(reinterpret_cast<char*>(&vocab_size), sizeof(vocab_size));
+    GGML_ASSERT(fin.gcount() == 8);
+
+    vocab.token_to_id.reserve(vocab_size);
+    vocab.id_to_token.reserve(vocab_size);
+
+    std::string packed_vocab = get_name(fin);
+    std::int64_t ctx_size = vocab_size * sizeof(float) + vocab_size + 2 * ggml_tensor_overhead();
+    ggml_context* ctx = ggml_init(ggml_init_params{ctx_size, nullptr, false});
+    ggml_tensor* lengths_tensor = load_tensor_value(fin, ctx);
+    std::int8_t* lengths = (std::int8_t*)lengths_tensor->data;
+    ggml_tensor* scores_tensor = load_tensor_value(fin, ctx);
+    float* scores = ggml_get_data_f32(scores_tensor);
+
+    int64_t offset = 0;
+    for (int i = 0; i < vocab_size; ++i) {
+        // TODO: we should use string view instead of copying each word in a new string
+        std::string word = packed_vocab.substr(offset, lengths[i]);
+        vocab.token_to_id[word] = i;
+        vocab.id_to_token.push_back({word, scores[i], LLAMA_TOKEN_TYPE_NORMAL});
+        offset += lengths[i] + 1;
+    }
+    // Since we copied lengths and scores, we don't need the context anymore.
+    ggml_free(ctx);
+
+    // vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+    // TODO: special tokens stuff ?
+}
+
 ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx)
 {
     int32_t n_dims = 0;
@@ -162,6 +200,7 @@ extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname)
     auto fin = open_ggml_file(fname);
     loader.load_hparams(model.hparams, fin);
     loader.load_hparams(model.layer_config, fin);
+    loader.load_vocab(model.vocab, fin);
     loader.load_model_weights(model, fin);
     return 0;
 }

+ 5 - 4
ggml/examples/unity/model_loader.h

@@ -6,16 +6,15 @@
 
 #pragma once
 
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
 
 #include "ggml/ggml.h"
 #include "ggml/ggml-alloc.h"
 
-#include "common.h"
-#include "common-ggml.h"
 #include "fairseq2.h"
 
-#include <iostream>
-#include <stdexcept>
 
 class model_loader {
 public:
@@ -23,6 +22,8 @@ public:
 
     void load_hparams(std::unordered_map<std::string, std::int64_t>& hparams, std::ifstream &fin);
 
+    void load_vocab(llama_vocab& vocab, std::ifstream &fin);
+
 private:
     ggml_tensor * next_tensor(std::ifstream &fin, fairseq2_model &model);
 

+ 202 - 0
ggml/examples/unity/unity.cpp

@@ -0,0 +1,202 @@
+#include "ggml/ggml.h"
+#include "ggml/ggml-alloc.h"
+
+#include "math.h"
+#include "model_loader.h"
+#include "fairseq2.h"
+
+#include <thread>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sndfile.h>
+#include <cstdlib>
+
+struct unity_params {
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    std::string model      = "/private/home/dnn/unity.cpp_inc/seamless_communication/ggml/seamlessM4T_medium.ggml"; // model path
+    std::string tgt_lang = "eng";
+    std::vector<std::string> files = {};
+    bool text = false;
+    SequenceGeneratorOptions opts = {
+        /*beam_size*/ 5,
+        /*min_seq_len*/ 1,
+        /*soft_max_seq_len_a*/ 1,
+        /*soft_max_seq_len_b*/ 200,
+        /*hard_max_seq_len*/ 1000,
+        /*len_penalty*/ 1.0,
+        /*unk_penalty*/ 0.0,
+        /*normalize_scores*/ true,
+    };
+};
+
+
+void unity_print_usage(int /*argc*/, char ** argv, const unity_params & params) {
+    fprintf(stderr, "usage: %s [options] file1 file2 ...\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  --text                text output\n");
+    fprintf(stderr, "  --beam-size           beam size (default: %d)\n", params.opts.beam_size);
+    fprintf(stderr, "\n");
+}
+
+std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, unity_params& params) {
+    if (i + 1 < argc && argv[i + 1][0] != '-') {
+        return argv[++i];
+    } else {
+        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
+        unity_print_usage(argc, argv, params);
+        exit(0);
+    }
+}
+
+
+bool unity_params_parse(int argc, char ** argv, unity_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+        if (arg == "-h" || arg == "--help") {
+            unity_print_usage(argc, argv, params);
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "-l" || arg == "--tgt-lang") {
+            params.tgt_lang = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "--text") {
+            params.text = true;
+        } else if (arg == "-b" || arg == "--beam-size") {
+            params.opts.beam_size = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else {
+            params.files.push_back(std::string(arg));
+        }
+    }
+    return true;
+}
+
+struct ggml_cgraph * unity_speech_encoder(
+        fairseq2_model& model,
+        struct ggml_tensor * speech_input) {
+    ggml_context* ctx0 = model.ctx;
+    ggml_cgraph* gf = ggml_new_graph(ctx0);
+    ggml_tensor* seqs = StandardConformerEncoder_forward(model, "speech_encoder", speech_input, nullptr);
+    seqs = ggml_dup(model.ctx, seqs);
+    ggml_build_forward_expand(gf, seqs);
+    return gf;
+}
+
+
+Hypothesis* unity_decode(
+        fairseq2_model& model,
+        const SequenceGeneratorOptions& opts,
+        int tgt_lang_idx,
+        ggml_tensor* encoder_output,
+        int n_threads
+) {
+    SequenceGeneratorJob job = {
+        opts,
+        /*prefix_seq*/ nullptr,
+        /*pad_idx*/model.vocab.token_to_id["<pad>"],
+        /*unk_idx*/model.vocab.token_to_id["<unk>"],
+        /*bos_idx*/model.vocab.token_to_id["<s>"],
+        /*eos_idx*/model.vocab.token_to_id["</s>"],
+        /*num_threads*/n_threads,
+    };
+    struct ggml_tensor * prefix_seq = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 2);
+    ((int *)prefix_seq->data)[0]  = job.eos_idx;
+    ((int *)prefix_seq->data)[1]  = tgt_lang_idx;
+    job.prefix_seq = prefix_seq;
+    return generate_sequence(model, job, encoder_output, nullptr, model.ctx);
+}
+
+int main(int argc, char ** argv) {
+
+    unity_params params;
+
+    if (unity_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    fairseq2_model model;
+
+    // load the model
+    if (load_fairseq2_ggml_file(model, params.model.c_str())) {
+        fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+        return 1;
+    }
+
+    char result_str[4096];
+    static std::vector<uint8_t> encoder_buf(4 * 1024LL * 1024LL * 1024LL);
+
+    std::string input;
+    bool interactive = params.files.size() == 0;
+    auto next_file = params.files.begin();
+    while (true) {
+        if (interactive) {
+            std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n";
+            std::getline(std::cin, input);
+            if (input == "exit") {
+                break;
+            }
+        } else {
+            if (next_file == params.files.end()) break;
+            input = *(next_file++);
+        }
+        std::istringstream iss(input);
+        std::string audio_path;
+        std::string tgt_lang = params.tgt_lang;
+        iss >> audio_path >> tgt_lang;
+        if (audio_path == "-") {
+            audio_path = "/proc/self/fd/0";
+        }
+        std::cerr << "Translating (Transcribing) " << audio_path << " to " << tgt_lang << "\n";
+        SF_INFO info;
+        SNDFILE* sndfile = sf_open(audio_path.c_str(), SFM_READ, &info);
+        if (!sndfile) {
+            std::cerr << "Could not open file " << audio_path << "\n";
+            if (interactive) continue;
+            else return 1;
+        }
+        auto tgt_lang_ptr = model.vocab.token_to_id.find("__" + tgt_lang + "__");
+        if (tgt_lang_ptr == model.vocab.token_to_id.end()) {
+            std::cerr << "Unknown language " << tgt_lang << "\n";
+            if (interactive) continue;
+            else return 2;
+        }
+        int tgt_lang_idx = tgt_lang_ptr->second;
+
+        // Load audio input
+        std::vector<float> data(info.frames * info.channels); // Assume info.channels is always 1
+        sf_readf_float(sndfile, data.data(), info.frames);
+
+        // Reset the ggml_context
+        model.ctx = ctx_from_buffer(encoder_buf);
+        ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, info.frames, 1);
+        memcpy(seqs->data, data.data(), data.size() * sizeof(float));
+        // Audio encoder
+        ggml_cgraph* gf = unity_speech_encoder(model, seqs);
+        ggml_graph_compute_with_ctx(model.ctx, gf, params.n_threads);
+        ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
+
+        // Beam search decoding
+        const Hypothesis* result = unity_decode(model, params.opts, tgt_lang_idx, encoder_output, params.n_threads);
+
+        // Drop language and bos token.
+        ggml_tensor* tokens = ggml_slice(model.ctx, result[0].seq, 0, 2, 0);
+
+        // Collect result string
+        int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str);
+        std::cout << std::string((char*)&result_str, n) << std::endl;
+    }
+
+    return 0;
+}
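
Note: an invocation sketch for the new CLI. The binary location and model file are assumptions (CMake puts example binaries under build/bin, and the model name mirrors the default above); the flags match unity_params_parse. Each positional argument is an audio path or a quoted "path lang" pair, and "-" reads audio from stdin.

    # Illustrative only: translate one file to French with the unity binary.
    import subprocess

    subprocess.run(
        [
            "build/bin/unity",               # assumed build output location
            "-m", "seamlessM4T_medium.ggml", # assumed model path
            "-l", "fra",
            "-b", "5",
            "test_data/test.wav",
        ],
        check=True,
    )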

+ 16 - 2
ggml/ggml.py

@@ -14,7 +14,7 @@ from typing import Any, Callable, Dict, Iterator, NamedTuple, Tuple, Type, Union
 import numpy as np
 import torch
 
-from ctypes_utils import Ptr, c_fn, c_struct
+from ctypes_utils import NULLPTR, Ptr, c_fn, c_struct
 from third_party_ggml import *
 
 ### Helpers
@@ -489,7 +489,7 @@ def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
 
 
 @c_fn(lib)
-def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
+def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: bytes) -> int:
     return -1
 
 
@@ -514,3 +514,17 @@ def fairseq2_kv_cache_alloc(
         yield
     finally:
         _fairseq2_kv_cache_reset(model)
+
+
+@c_fn(lib)
+def fairseq2_spm_tokenize(
+    model: ctypes.c_void_p, text: bytes, out: Ptr[ggml_tensor]
+) -> None:
+    pass
+
+
+@c_fn(lib)
+def fairseq2_spm_detokenize(
+    model: ctypes.c_void_p, tensor: Ptr[ggml_tensor], out: ctypes.Array[ctypes.c_char]
+) -> ctypes.c_size_t:
+    return 0
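
Note: a usage sketch for the two new bindings, mirroring test_tokenizer in test_unity_cpp.py further down; the model path and buffer sizes are illustrative.

    # Sketch: tokenize and detokenize through the new SentencePiece entry points.
    import ctypes
    from pathlib import Path

    import ggml

    g_model = ggml.load_fairseq2_ggml_file(Path("seamlessM4T_medium.ggml"))
    ctx = ggml.ggml_init(ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None))
    ggml.lib.fairseq2_model_set_inference_ctx(g_model.ptr, ctx)

    tokens = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_I32, 256)
    ggml.fairseq2_spm_tokenize(g_model.ptr, b"We are all in a yellow submarine.", tokens)

    out = ctypes.create_string_buffer(144)
    ggml.fairseq2_spm_detokenize(g_model.ptr, tokens, out)
    print(ctypes.string_at(out))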

+ 48 - 12
ggml/ggml_convert.py

@@ -18,7 +18,7 @@ from fairseq2.assets import AssetCard
 from fairseq2.models.transformer.frontend import TransformerEmbeddingFrontend
 from fairseq2.nn import SinusoidalPositionEncoder
 from fairseq2.nn.transformer import RelativePositionalEncoding
-from seamless_communication.models.unity import load_unity_config, load_unity_model
+from seamless_communication.models import unity
 
 import ggml
 
@@ -29,6 +29,7 @@ def convert_model(
     model_name: Union[str, torch.nn.Module],
     out: Optional[Path] = None,
     hparams: Optional[Dict[str, Any]] = None,
+    vocab: Optional[List[Tuple[str, float]]] = None,
 ) -> None:
     if isinstance(model_name, str):
         # Load the corresponding fairseq2 model
@@ -38,12 +39,15 @@ def convert_model(
         # The type of model depends on the name
         if "unity" in model_name or "seamlessM4T" in model_name:
             if hparams is None:
-                model_config = load_unity_config(model_name)
+                model_config = unity.load_unity_config(model_name)
                 hparams = flatten_config(
                     dataclasses.asdict(model_config), separator="__"
                 )
                 print(hparams)
-            model = load_unity_model(model_name)
+            model = unity.load_unity_model(model_name)
+            if vocab is None:
+                tokenizer = unity.load_unity_text_tokenizer(model_name)
+                vocab = read_vocab(tokenizer)
         else:
             raise ValueError(f"Unsupported model type: {model_name}")
     else:
@@ -57,9 +61,9 @@ def convert_model(
     state_dict = model.state_dict()
     fixup_model(model, state_dict)
     layer_config = read_layer_config(model)
+    vocab = vocab or []
 
-    with out.open("wb") as o:
-        write_ggml_file(o, hparams, layer_config, state_dict)
+    write_ggml_file(out, hparams, layer_config, vocab, state_dict)
 
 
 def _nested_getattr(model: Any, name: str) -> Any:
@@ -120,16 +124,28 @@ def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor]) ->
         state_dict["speech_encoder.pos_enc"] = rel_pos_enc.freqs
 
 
+def read_vocab(tokenizer: Any) -> List[Tuple[str, float]]:
+    vocab_info = tokenizer.vocab_info
+    vocab = [
+        (tokenizer.model.index_to_token(i).replace("▁", " "), -i)
+        for i in range(vocab_info.size)
+    ]
+    return vocab  # type: ignore[return-value]
+
+
 def write_ggml_file(
-    out: BufferedWriter,
+    out: Path,
     hparams: Dict[str, Any],
     layer_config: Dict[str, Any],
+    vocab: List[Tuple[str, float]],
     state_dict: Dict[str, torch.Tensor],
 ) -> None:
-    write_ggml_header(out)
-    write_hparams(out, hparams)
-    write_hparams(out, layer_config)
-    write_state_dict(out, state_dict)
+    with out.open("wb") as o:
+        write_ggml_header(o)
+        write_hparams(o, hparams)
+        write_hparams(o, layer_config)
+        write_vocab(o, vocab)
+        write_state_dict(o, state_dict)
 
 
 def write_ggml_header(out: BufferedWriter) -> None:
@@ -162,6 +178,24 @@ def write_hparams(out: BufferedWriter, hparams: Dict[str, Any]) -> None:
     logging.info(f"Saved {len(simple_vals)} params.")
 
 
+def write_vocab(out: BufferedWriter, vocab: List[Tuple[str, float]]) -> None:
+    out.write(struct.pack("<q", len(vocab)))
+
+    # Write all words concatenated in a buffer
+    words = [bytes(w, "utf8") for w, score in vocab]
+    packed_words = b"\0".join(words)
+    # We use i32 to allow reusing the existing string-loading code
+    packed_len = struct.pack("<i", len(packed_words))
+    out.write(packed_len)
+    out.write(packed_words)
+
+    lengths = torch.tensor([len(w) for w in words], dtype=torch.int8)
+    write_tensor(out, lengths)
+
+    scores = torch.tensor([score for w, score in vocab], dtype=torch.float32)
+    write_tensor(out, scores)
+
+
 def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -> None:
     """Write pytorch state dict.
 
@@ -234,13 +268,15 @@ def write_tensor(out: BufferedWriter, value: torch.Tensor) -> None:
     data.tofile(out)
 
 
-def torch_to_ggml_type(dtype: type) -> int:
+def torch_to_ggml_type(dtype: torch.dtype) -> int:
     if dtype is torch.float32:
         return ggml.GGML_TYPE_F32
     elif dtype is torch.float16:
         return ggml.GGML_TYPE_F16
     elif dtype is torch.int32:
         return ggml.GGML_TYPE_I32
+    elif dtype is torch.int8:
+        return ggml.GGML_TYPE_I8
     else:
         raise NotImplementedError(f"{dtype} is not mapped to a GGML_TYPE")
 
@@ -293,7 +329,7 @@ def read_layer_config(model: torch.nn.Module) -> Dict[str, Any]:
             if k.startswith("_"):
                 continue
             # All modules have a "training" flag
-            if k == "training":
+            if k in ("training", "init_fn"):
                 continue
             if v is None:
                 continue
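
Note: with this change, convert_model embeds the tokenizer vocab whenever no explicit vocab is passed. A minimal conversion sketch (model name and output path as used elsewhere in this commit):

    # Sketch: convert the fairseq2 checkpoint and embed its SentencePiece vocab.
    from pathlib import Path

    from ggml_convert import convert_model

    convert_model("seamlessM4T_medium", Path("seamlessM4T_medium.ggml"))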

+ 2 - 12
ggml/include/ggml/ggml.h

@@ -363,8 +363,6 @@ extern "C" {
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_CONCAT,
-        GGML_OP_REMOVE_HEAD_ROW,
-        GGML_OP_GET_FIRST_COLS_BY_ROWS,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_BATCH_NORM, 
@@ -562,7 +560,7 @@ extern "C" {
 
     struct ggml_init_params {
         // memory pool
-        size_t mem_size;   // bytes
+        int64_t mem_size;   // bytes
         void * mem_buffer; // if NULL, memory will be allocated internally
         bool   no_alloc;   // don't allocate memory for the tensor data
     };
@@ -645,7 +643,7 @@ extern "C" {
     GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
     GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
-    GGML_API size_t  ggml_get_mem_size       (const struct ggml_context * ctx);
+    GGML_API int64_t  ggml_get_mem_size       (const struct ggml_context * ctx);
     GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
@@ -852,14 +850,6 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    GGML_API struct ggml_tensor * ggml_remove_head_row(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_get_first_cols_by_rows(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+ 4 - 0
ggml/src/CMakeLists.txt

@@ -261,8 +261,12 @@ target_include_directories(${TARGET} PUBLIC
     ../include
     ../include/ggml
     ../examples/
+    ../tracy/public/
     ${GGML_EXTRA_INCS}
     )
+if (TRACY_ENABLE)
+    target_link_libraries (${TARGET} PUBLIC Tracy::TracyClient )
+endif()
 
 if (MSVC)
     target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank)

+ 213 - 165
ggml/src/ggml.c

@@ -25,6 +25,7 @@
 #include <limits.h>
 #include <stdarg.h>
 #include <signal.h>
+#include "tracy/TracyC.h"
 
 
 #ifdef GGML_USE_METAL
@@ -2333,7 +2334,9 @@ inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, co
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
 
 static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
-#ifdef GGML_SIMD
+#if defined(GGML_USE_OPENBLAS)
+    float sumf = cblas_sdot(n, x, 1, y, 1);
+#elif defined(GGML_SIMD)
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -3943,8 +3946,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "REPEAT",
     "REPEAT_BACK",
     "CONCAT",
-    "REMOVE_HEAD_ROW",
-    "GET_FIRST_COLS_BY_ROWS",
     "SILU_BACK",
     "NORM",
     "BATCH_NORM",
@@ -4014,7 +4015,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-// static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); // commented out for dev
+static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -4036,10 +4037,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "repeat(x)",
     "repeat_back(x)",
     "concat(x, y)",
-    "remove_head_row(x)",
-    "get_first_cols_by_rows(x)",
-    "remove_head_row(x)",
-    "get_first_cols_by_rows(x)",
     "silu_back(x)",
     "norm(x)",
     "batch_norm(x)",
@@ -4107,8 +4104,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-// static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
-// static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4162,7 +4158,7 @@ static void ggml_setup_op_has_task_pass(void) {
 //
 
 struct ggml_context {
-    size_t mem_size;
+    int64_t mem_size;
     void * mem_buffer;
     bool   mem_buffer_owned;
     bool   no_alloc;
@@ -4699,7 +4695,7 @@ void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
     return ctx->mem_buffer;
 }
 
-size_t ggml_get_mem_size(const struct ggml_context * ctx) {
+int64_t ggml_get_mem_size(const struct ggml_context * ctx) {
     return ctx->mem_size;
 }
 
@@ -5860,45 +5856,6 @@ struct ggml_tensor * ggml_concat(
     return result;
 }
 
-// ggml_remove_head_row
-
-struct ggml_tensor * ggml_remove_head_row(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1]-1, a->ne[2], a->ne[3]);
-
-    result->op = GGML_OP_REMOVE_HEAD_ROW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_get_first_cols_by_rows
-
-struct ggml_tensor * ggml_get_first_cols_by_rows(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[1], a->ne[1], a->ne[2], a->ne[3]);
-
-    result->op = GGML_OP_GET_FIRST_COLS_BY_ROWS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 // ggml_abs
 
 struct ggml_tensor * ggml_abs(
@@ -6253,6 +6210,23 @@ struct ggml_tensor * ggml_mul_mat(
     GGML_ASSERT(ggml_can_mul_mat(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
 
+#if defined(GGML_USE_OPENBLAS) && GGML_DEBUG
+
+    const int64_t i = a->ne[1];
+    const int64_t j = b->ne[1];
+    const int64_t k = a->ne[0]; // = b->ne[0]
+
+    bool big = (i >= 32 && j >= 32 && k >= 32);
+    big = big || (i >= 512 && k >= 512);
+
+    if (!big) {
+        printf("Not using OpenBLAS for small matmul (%lld, %lld) @ (%lld, %lld)\n", (long long)i, (long long)k, (long long)j, (long long)k);
+    }
+    if (!ggml_is_contiguous(a) || !ggml_is_contiguous(b)) {
+        printf("Not using OpenBLAS for matmul (%lld, %lld) @ (%lld, %lld) because of non-contiguous tensors\n", (long long)i, (long long)k, (long long)j, (long long)k);
+    }
+#endif
+
     bool is_node = false;
 
     if (a->grad || b->grad) {
@@ -9073,7 +9047,17 @@ static void ggml_compute_forward_add_f32(
 
 #ifdef GGML_USE_ACCELERATE
             vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
-#else
+#elif defined(GGML_USE_OPENBLAS)
+            // saxpy computes y += a*x, so it only helps when dst aliases one of the inputs.
+            if (dst_ptr == src0_ptr) {
+                cblas_saxpy(ne00, 1.0f, src1_ptr, 1, dst_ptr, 1);
+            } else if (dst_ptr == src1_ptr) {
+                cblas_saxpy(ne00, 1.0f, src0_ptr, 1, dst_ptr, 1);
+            } else {
+                // Fallback to manual loop.
+                ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
+            }
+#else
             ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
 #endif
                 // }
@@ -9094,11 +9078,24 @@ static void ggml_compute_forward_add_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
+#if defined(GGML_USE_OPENBLAS)
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+            if (dst_ptr == src0_ptr) {
+                cblas_saxpy(ne0, 1.0f, src1_ptr, nb10 / sizeof(float), dst_ptr, 1);
+                continue;
+            } else if (dst_ptr == src1_ptr) {
+                cblas_saxpy(ne0, 1.0f, src0_ptr, 1, dst_ptr, nb10 / sizeof(float));
+                continue;
+            } else {
+                // Neither input aliases dst: this strided add is not handled in the BLAS path.
+                abort();
+            }
+#else
             for (int i0 = 0; i0 < ne0; i0++) {
                 float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
-
                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
+#endif
         }
     }
 }
@@ -10534,99 +10531,6 @@ static void ggml_compute_forward_concat(
     }
 }
 
-// ggml_compute_forward_remove_head_row
-
-static void ggml_compute_forward_remove_head_row_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-    for (int i2 = 0; i2 < ne02; i2++) {
-        for (int i1 = 1; i1 < ne01; i1++) {
-            for (int i0 = 0; i0 < ne00; i0++) {
-                const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02);
-                float * y = (float *)((char *)dst->data + i0 * nb0 + (i1-1) * nb1 + i2 * nb2);
-                *y = *x;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_remove_head_row(
-    const struct ggml_compute_params* params,
-    const struct ggml_tensor* src0,
-    struct ggml_tensor* dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_remove_head_row_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_get_first_cols_by_rows
-
-static void ggml_compute_forward_get_first_cols_by_rows_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = 0; i2 < ne2; i2++) {
-            for (int i1 = 0; i1 < ne1; i1++) {
-                for (int i0 = 0; i0 < ne1; i0++) {
-                    const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
-                    float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                    *y = *x;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_first_cols_by_rows(
-    const struct ggml_compute_params* params,
-    const struct ggml_tensor* src0,
-    struct ggml_tensor* dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_get_first_cols_by_rows_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_abs
 
 static void ggml_compute_forward_abs_f32(
@@ -11679,11 +11583,13 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+        ggml_is_contiguous(src1)) {
+
+        bool big = (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
+        big = big || (ne0 >= 512 && ne10 >= 512);
 
         /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
+        return big;
     }
 
     return false;
@@ -11786,11 +11692,17 @@ static void ggml_compute_forward_mul_mat(
                     x = wdata;
                 }
 
+                TracyCPlot("cblas_sgemm_B", ne13 * ne12);
+                TracyCPlot("cblas_sgemm_M", ne11);
+                TracyCPlot("cblas_sgemm_N", ne01);
+                TracyCPlot("cblas_sgemm_K", ne10);
+                TracyCZoneN(_tracy_sgemm, "cblas_sgemm", true);
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
+                TracyCZoneEnd(_tracy_sgemm);
             }
         }
 
@@ -16823,332 +16735,472 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_DUP", true);
                 ggml_compute_forward_dup(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ADD:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD", true);
                 ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ADD1:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD1", true);
                 ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ACC:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_ACC", true);
                 ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SUB:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SUB", true);
                 ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MUL:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MUL", true);
                 ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIV:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_DIV", true);
                 ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SQR:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SQR", true);
                 ggml_compute_forward_sqr(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SQRT:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SQRT", true);
                 ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_LOG:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_LOG", true);
                 ggml_compute_forward_log(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SUM:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SUM", true);
                 ggml_compute_forward_sum(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SUM_ROWS:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SUM_ROWS", true);
                 ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MEAN:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MEAN", true);
                 ggml_compute_forward_mean(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ARGMAX:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_ARGMAX", true);
                 ggml_compute_forward_argmax(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_REPEAT:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_REPEAT", true);
                 ggml_compute_forward_repeat(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_REPEAT_BACK:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_REPEAT_BACK", true);
                 ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONCAT:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONCAT", true);
                 ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
-        case GGML_OP_REMOVE_HEAD_ROW:
-            {
-                ggml_compute_forward_remove_head_row(params, tensor->src[0], tensor);
-            } break;    
-        case GGML_OP_GET_FIRST_COLS_BY_ROWS:
-            {
-                ggml_compute_forward_get_first_cols_by_rows(params, tensor->src[0], tensor);
-            } break;      
         case GGML_OP_SILU_BACK:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SILU_BACK", true);
                 ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_NORM:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_NORM", true);
                 ggml_compute_forward_norm(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_BATCH_NORM:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_BATCH_NORM", true);
                 ggml_compute_forward_batch_norm(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_RMS_NORM:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_RMS_NORM", true);
                 ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_RMS_NORM_BACK:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_RMS_NORM_BACK", true);
                 ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GROUP_NORM:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_GROUP_NORM", true);
                 ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MUL_MAT:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MUL_MAT", true);
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_OUT_PROD:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_OUT_PROD", true);
                 ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SCALE:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SCALE", true);
                 ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SET:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SET", true);
                 ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CPY:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CPY", true);
                 ggml_compute_forward_cpy(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONT:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONT", true);
                 ggml_compute_forward_cont(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_RESHAPE:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_RESHAPE", true);
                 ggml_compute_forward_reshape(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_VIEW:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_VIEW", true);
                 ggml_compute_forward_view(params, tensor->src[0]);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_PERMUTE:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_PERMUTE", true);
                 ggml_compute_forward_permute(params, tensor->src[0]);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_TRANSPOSE:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_TRANSPOSE", true);
                 ggml_compute_forward_transpose(params, tensor->src[0]);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GET_ROWS:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_ROWS", true);
                 ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GET_ROWS_BACK:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_ROWS_BACK", true);
                 ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIAG:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG", true);
                 ggml_compute_forward_diag(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIAG_MASK_INF:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG_MASK_INF", true);
                 ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG_MASK_ZERO", true);
                 ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SOFT_MAX:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SOFT_MAX", true);
                 ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_SOFT_MAX_BACK", true);
                 ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ROPE:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_ROPE", true);
                 ggml_compute_forward_rope(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ROPE_BACK:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_ROPE_BACK", true);
                 ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ALIBI:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_ALIBI", true);
                 ggml_compute_forward_alibi(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CLAMP:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CLAMP", true);
                 ggml_compute_forward_clamp(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D", true);
                 ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_STAGE_0:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_0", true);
                 ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_STAGE_1:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_1", true);
                 ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_STAGE_2:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_2", true);
                 ggml_compute_forward_conv_1d_stage_2(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_GENERIC:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC", true);
                 ggml_compute_forward_conv_1d_generic(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_GENERIC_STAGE_0:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC_STAGE_0", true);
                 ggml_compute_forward_conv_1d_generic_stage_0(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_GENERIC_STAGE_1:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC_STAGE_1", true);
                 ggml_compute_forward_conv_1d_generic_stage_1(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_2D:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_2D", true);
                 ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_TRANSPOSE_2D", true);
                 ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_POOL_1D:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_POOL_1D", true);
                 ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_POOL_2D:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_POOL_2D", true);
                 ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_UPSCALE:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_UPSCALE", true);
                 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_FLASH_ATTN:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_ATTN", true);
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
                 GGML_ASSERT(t == 0 || t == 1);
                 const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_FLASH_FF:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_FF", true);
                 ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_ATTN_BACK", true);
                 int32_t t = ggml_get_op_params_i32(tensor, 0);
                 GGML_ASSERT(t == 0 || t == 1);
                 bool masked = t != 0;
                 ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_WIN_PART:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_WIN_PART", true);
                 ggml_compute_forward_win_part(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_WIN_UNPART:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_WIN_UNPART", true);
                 ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_UNARY:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_UNARY", true);
                 ggml_compute_forward_unary(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GET_REL_POS:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_REL_POS", true);
                 ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ADD_REL_POS:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD_REL_POS", true);
                 ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MAP_UNARY:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_UNARY", true);
                 ggml_unary_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_BINARY:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_BINARY", true);
                 ggml_binary_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM1_F32:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM1_F32", true);
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM2_F32:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM2_F32", true);
                 ggml_custom2_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM3_F32:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM3_F32", true);
                 ggml_custom3_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM1:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM1", true);
                 ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM2:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM2", true);
                 ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM3:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM3", true);
                 ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CROSS_ENTROPY_LOSS", true);
                 ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             {
+                TracyCZoneN(__tracy_ctx, "GGML_OP_CROSS_ENTROPY_LOSS_BACK", true);
                 ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_NONE:
@@ -17358,14 +17410,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: implement
             } break;
-        case GGML_OP_REMOVE_HEAD_ROW:
-            {
-                GGML_ASSERT(false); // TODO: implement
-            } break;
-        case GGML_OP_GET_FIRST_COLS_BY_ROWS:
-            {
-                GGML_ASSERT(false); // TODO: implement
-            } break;
         case GGML_OP_SILU_BACK:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -19019,13 +19063,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 }
 
 void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
+    TracyCZoneN(_tracy_graph, "ggml_graph_plan", true);
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+    TracyCZoneEnd(_tracy_graph);
 
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
 
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
+    TracyCZoneN(_tracy_compute, "ggml_graph_compute", true);
     ggml_graph_compute(cgraph, &cplan);
+    TracyCZoneEnd(_tracy_compute);
 }
 
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
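
Note: the size check in ggml_compute_forward_mul_mat_use_blas now also routes single-row matmuls with large inner and output dimensions to sgemm, which covers the decoder-side (B, 1, D_out) @ (D_in, D_out) case. A sketch of the predicate with the contiguity checks omitted (ne0 and ne1 are the output dims, ne10 the shared inner dim):

    # Sketch of the updated BLAS dispatch rule from ggml.c above.
    def mul_mat_use_blas(ne0: int, ne1: int, ne10: int) -> bool:
        big = ne0 >= 32 and ne1 >= 32 and ne10 >= 32
        return big or (ne0 >= 512 and ne10 >= 512)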

+ 0 - 0
ggml/test.wav → ggml/test_data/test.wav


+ 61 - 26
ggml/test_unity_cpp.py

@@ -8,6 +8,7 @@ from typing import Any, Iterator, List, Tuple
 
 import fairseq2.nn
 import fairseq2.nn.transformer
+from fairseq2.nn.padding import PaddingMask
 import numpy as np
 import pytest
 import torch
@@ -22,7 +23,6 @@ from ctypes_utils import NULLPTR, Ptr
 from ggml import NativeObj
 from ggml_convert import convert_model, read_layer_config
 
-
 Ctx = ggml.ggml_context_p
 
 UNITY_MODELS = Path(__file__).parent / "examples/unity/models"
@@ -31,7 +31,7 @@ CTX_PARAMS = ggml.ggml_init_params(mem_size=1024 * 1024 * 1024 * 5, mem_buffer=N
 FAIRSEQ2_CPP = Path(__file__).parent / "examples/unity/fairseq2.cpp"
 UNITY_FLASH_ATTN = "\n# define UNITY_FLASH_ATTN 0\n" not in FAIRSEQ2_CPP.read_text()
 
-DATA = Path(__file__).parent
+DATA = Path(__file__).parent / "test_data"
 DATA_DEV = DATA / "dev"
 if not DATA_DEV.exists():
     DATA_DEV = Path(
@@ -329,7 +329,9 @@ def test_MultiheadAttention_forward_cross_attn_with_cache(
                 assert np.allclose(
                     state.get()[0].transpose(1, 2).numpy(),
                     ggml.to_numpy(
-                        nodes[b"text_decoder.layers.0.encoder_decoder_attn.k_cache (view)"]
+                        nodes[
+                            b"text_decoder.layers.0.encoder_decoder_attn.k_cache (view)"
+                        ]
                     ),
                     atol=1e-3,
                 )
@@ -378,7 +380,8 @@ def test_StandardConformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> N
         pytest.skip(reason=f"Folder {DATA_DEV} not found !")
 
     x = torch.load(DATA_DEV / "seqs_before_conformer_block.pt")
-    padding_mask = torch.ones((1, x.shape[1]))
+    padding_mask = PaddingMask(torch.ones(1, x.shape[1]), x.shape[1])
+
     layer = pt_model.speech_encoder.inner.layers[0]
     gx = ggml.from_numpy(ctx, x[0])
     ggml.ggml_set_name(gx, b"x")
@@ -477,25 +480,32 @@ def test_StandardConformerEncoder_forward(ctx: Ctx, g_model: c_void_p) -> None:
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
 
-    converter = WaveformToFbankConverter(
-        num_mel_bins=80,
-        waveform_scale=2**15,
-        channel_last=True,
-        standardize=True,
-    )
-    converter_input = {
-        "waveform": wav.transpose(0, 1),
-        "sample_rate": 16000.0,
-        "format": -1,
-    }
-
     y = ggml.to_numpy(gy)
-    speech_encoder_input = pt_model.speech_encoder_frontend(
-        converter(converter_input)["fbank"].unsqueeze(0), None
-    )[0]
 
-    y_exp, _ = pt_model.speech_encoder(speech_encoder_input, None)
-    y_exp = y_exp.numpy()  # remove batch dimension
+    cache = DATA / "test_StandardConformerEncoder_forward.npy"
+    if not cache.exists():
+        converter = WaveformToFbankConverter(
+            num_mel_bins=80,
+            waveform_scale=2**15,
+            channel_last=True,
+            standardize=True,
+        )
+        converter_input = {
+            "waveform": wav.transpose(0, 1),
+            "sample_rate": 16000.0,
+            "format": -1,
+        }
+
+        pt_model = load_pt_model()
+        speech_encoder_input = pt_model.speech_encoder_frontend(
+            converter(converter_input)["fbank"].unsqueeze(0), None
+        )[0]
+
+        y_exp, _ = pt_model.speech_encoder(speech_encoder_input, None)
+        y_exp = y_exp.numpy()
+        np.save(cache, y_exp)
+    else:
+        y_exp = np.load(cache)
 
     assert y.shape == y_exp.shape
     assert np.allclose(
@@ -512,7 +522,7 @@ def test_WaveformToFbank_forward(ctx: Ctx, g_model: c_void_p) -> None:
         standardize=True,
     )
     extractor = Wav2Vec2FbankFeatureExtractor(80, stride=2, sample_every_k=1)
-    wav, _ = torchaudio.load(DATA / "test.wav")
+    wav, _ = torchaudio.load(DATA / "LJ037-0171_sr16k_test.wav")
     gx = ggml.from_numpy(ctx, wav * 2**15)  # Apply scale before sending into ggml!
     ggml.ggml_set_name(gx, b"x")
 
@@ -540,7 +550,7 @@ def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None:
     pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=0)
     y_exp = pos_encoder(seq, None)[0].numpy()
 
-    gseq = ggml.from_numpy(ctx, seq[0].numpy())
+    gseq = ggml.from_numpy(ctx, seq[0].clone().numpy())
     ggml.ggml_set_name(gseq, b"seq")
     gy = ggml.forward(
         "PositionalEmbedding", g_model, "text_decoder_frontend.pos_encoder", gseq
@@ -633,6 +643,32 @@ def test_StandardTransformerDecoder_forward(ctx: Ctx, g_model: c_void_p) -> None
     assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-3)
 
 
+def test_tokenizer(ctx: Ctx) -> None:
+    tokenizer = unity.load_unity_text_tokenizer("seamlessM4T_medium")
+    enc = tokenizer.create_encoder(task="translation", lang="eng", mode="source")
+
+    spm_path = DATA / "seamlessM4T_medium.spm.ggml"
+    # Always regenerate the spm file for now (caching with `if not spm_path.exists():` is disabled).
+    if True:
+        vocab = ggml_convert.read_vocab(tokenizer)
+        ggml_convert.write_ggml_file(spm_path, {"spm_vocab_only": True}, {}, vocab, {})
+
+    g_model = ggml.load_fairseq2_ggml_file(spm_path)
+    ggml.lib.fairseq2_model_set_inference_ctx(g_model.ptr, ctx)
+
+    expected = enc("We are all in a yellow submarine.").tolist()[1:]
+    tokens = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_I32, 256)
+    ggml.fairseq2_spm_tokenize(
+        g_model.ptr, b"We are all in a yellow submarine.", tokens
+    )
+    res = ggml.to_numpy(tokens).tolist()
+    assert expected == res
+
+    out = ctypes.create_string_buffer(144)
+    ggml.fairseq2_spm_detokenize(g_model.ptr, tokens, out)
+    assert ctypes.string_at(out) == b"We are all in a yellow submarine."
+
+
 def test_t2tt(ctx: Ctx, g_model: c_void_p) -> None:
     src_lang = "eng"
     src_text = "We are all in a yellow submarine."
@@ -700,6 +736,7 @@ def test_t2tt(ctx: Ctx, g_model: c_void_p) -> None:
         unk_idx=1,
         bos_idx=2,
         eos_idx=3,
+        num_threads=16,
     )
 
     result_ptr = ggml.generate_sequence(g_model, job, encoder_out, NULLPTR, ctx)
@@ -789,9 +826,7 @@ def test_s2tt(ctx: Ctx, g_model: c_void_p):
     )
     result_ptr = ggml.generate_sequence(g_model, Ptr(job), encoder_out, NULLPTR, ctx)
     results = [result_ptr[i] for i in range(beam_size) if result_ptr[i].seq != None]
-    assert_hypotheses(
-        exp["hypotheses"], results, score_rtol=1e-2, step_scores_rtol=0.1
-    )
+    assert_hypotheses(exp["hypotheses"], results, score_rtol=1e-2, step_scores_rtol=0.1)
 
 
 def assert_hypotheses(

+ 2 - 2
ggml/third_party_ggml.py

@@ -774,7 +774,7 @@ class ggml_init_params(ctypes.Structure):
     """
 
     _fields_ = [
-        ("mem_size", ctypes.c_size_t),
+        ("mem_size", ctypes.c_int64),
         ("mem_buffer", ctypes.c_void_p),
         ("no_alloc", ctypes.c_bool),
     ]
@@ -1257,7 +1257,7 @@ def ggml_get_mem_size(ctx: ggml_context_p) -> int:
 
 
 lib.ggml_get_mem_size.argtypes = [ggml_context_p]
-lib.ggml_get_mem_size.restype = ctypes.c_size_t
+lib.ggml_get_mem_size.restype = ctypes.c_int64
 
 
 # GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);
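
Note: with mem_size widened to a signed 64-bit field on both the C and ctypes side, multi-GiB contexts can be described directly from Python, as the tests do. A small sketch (the size is illustrative):

    # Sketch: request a 5 GiB ggml context through the updated bindings.
    import ggml

    params = ggml.ggml_init_params(mem_size=5 * 1024**3, mem_buffer=None, no_alloc=False)
    ctx = ggml.ggml_init(params)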

+ 1 - 1
src/seamless_communication/inference/translator.py

@@ -142,7 +142,7 @@ class Translator(nn.Module):
             dtype=dtype,
         )
         self.collate = Collater(
-            pad_value=self.text_tokenizer.vocab_info.pad_idx, pad_to_multiple=2
+            pad_value=self.text_tokenizer.vocab_info.pad_idx or 0, pad_to_multiple=2
         )
         self.vocoder = None
         if vocoder_name_or_card is not None and (