
WIP: simple failing test case

Guillaume Wenzek, 1 year ago
parent
commit 522b97234e
5 changed files with 165 additions and 60 deletions
  1. ggml/ctypes_utils.py (+4 -4)
  2. ggml/examples/unity/fairseq2.cpp (+68 -21)
  3. ggml/examples/unity/fairseq2.h (+1 -1)
  4. ggml/ggml.py (+4 -0)
  5. ggml/test_unity_cpp.py (+88 -34)

ggml/ctypes_utils.py (+4 -4)

@@ -32,9 +32,11 @@ def _py_type_to_ctype(t: type):
         raise ValueError(
             f"Type parsing of '{t}' isn't supported, you need to provide a real type annotation."
         )
-    if t.__module__ == "ctypes":
-        return t
+    if t is None:
+        return None
     if isinstance(t, type):
+        if t.__module__ == "ctypes":
+            return t
         if issubclass(t, ctypes.Structure):
             return t
         if issubclass(t, ctypes._Pointer):
@@ -47,8 +49,6 @@ def _py_type_to_ctype(t: type):
         return ctypes.c_bool
     if t is str:
         return ctypes.c_char_p
-    if t is None:
-        return None
 
     if getattr(t, "__origin__", None) is Ptr:
         pointee = _py_type_to_ctype(t.__args__[0])

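The reordering above matters: `None` has no `__module__` attribute, so the old `t.__module__ == "ctypes"` check raised `AttributeError` for `None` return annotations before the `t is None` branch at the bottom could ever run, and non-class annotations should not reach the `issubclass` checks. A minimal sketch of the fixed dispatch order (a hypothetical simplification; the `Ptr` generic and pointer handling are elided):

```python
import ctypes

def _py_type_to_ctype_sketch(t):
    # must run first: None.__module__ raises AttributeError
    if t is None:
        return None
    # only real classes reach the class-based checks
    if isinstance(t, type):
        if t.__module__ == "ctypes":
            return t
        if issubclass(t, ctypes.Structure):
            return t
    if t is int:
        return ctypes.c_int
    if t is float:
        return ctypes.c_float
    if t is bool:
        return ctypes.c_bool
    if t is str:
        return ctypes.c_char_p
    raise ValueError(f"Type parsing of '{t}' isn't supported.")
```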
ggml/examples/unity/fairseq2.cpp (+68 -21)

@@ -8,6 +8,11 @@
 #include <iostream>
 #include <fnmatch.h>
 
+void ggml_detach(ggml_tensor* a) {
+    a->op = GGML_OP_NONE;
+    std::fill(a->src, a->src + GGML_MAX_SRC, nullptr);
+}
+
 /// allocate the fairseq2 model and hyperparameters
 extern "C" fairseq2_model* fairseq2_model_alloc() {
     // pre-allocate some memory to write hyperparameters and tensors pointers
@@ -16,11 +21,14 @@ extern "C" fairseq2_model* fairseq2_model_alloc() {
     return model;
 }
 
-void fairseq2_kv_cache_alloc(const fairseq2_model& model, std::size_t beam_size, std::size_t max_seq_len) {
+extern "C" void fairseq2_kv_cache_alloc(const fairseq2_model& model, int beam_size, int max_seq_len) {
     // Note: we only allocate the cache for the decoder attention.
     // For encoder attention since we compute it all at once,
     // the allocation is delayed to the first forward pass, to not over allocate.
     auto layer_glob_c = "*decoder.*attn.k_proj.weight";
+    ggml_tensor* self_attn_mask = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, max_seq_len, max_seq_len);
+    self_attn_mask = ggml_diag_mask_inf(model.ctx, self_attn_mask, 0);
+
     for (auto named_tensor : model.tensors) {
         const std::string& name = named_tensor.first;
         if (::fnmatch(layer_glob_c, name.c_str(), 0) == FNM_NOMATCH)
@@ -31,6 +39,7 @@ void fairseq2_kv_cache_alloc(const fairseq2_model& model, std::size_t beam_size,
         model.kv_cache[name.substr(0, name.size() - 14)] = KeyValueTensor {
             ggml_new_tensor_3d(model.ctx, k_proj->type, model_dim, max_seq_len, beam_size),
             ggml_new_tensor_3d(model.ctx, k_proj->type, model_dim, max_seq_len, beam_size),
+            self_attn_mask,
             0,
         };
     }
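The cache now carries one shared `(max_seq_len, max_seq_len)` causal mask, built once per allocation. A numpy sketch of what `ggml_diag_mask_inf` produces here, assuming the freshly allocated buffer reads as zeros (the op only writes the above-diagonal entries and leaves the rest of the buffer untouched):

```python
import numpy as np

max_seq_len = 4
# -inf strictly above the diagonal (future positions), untouched (assumed 0) elsewhere
mask = np.triu(np.full((max_seq_len, max_seq_len), -np.inf), k=1)
# [[  0. -inf -inf -inf]
#  [  0.   0. -inf -inf]
#  [  0.   0.   0. -inf]
#  [  0.   0.   0.   0.]]
```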
@@ -43,21 +52,67 @@ bool has_kv_cache(const fairseq2_model& model) {
 // copy k and v to kv cache
 // kv.full_k[step_nr] = k;
 // kv.full_v[step_nr] = v;
-void append_to_prev_kv(const fairseq2_model& model, const std::string& prefix, ggml_tensor** k, ggml_tensor** v) {
+void append_to_prev_kv(const fairseq2_model& model, const std::string& prefix, ggml_tensor** k, ggml_tensor** v, ggml_tensor** self_attn_mask) {
     KeyValueTensor& kv = model.kv_cache[prefix];
     int step_nr = kv.step_nr;
 
     ggml_tensor* full_k = kv.full_k;
     ggml_tensor* full_v = kv.full_v;
 
+    // (N, S_kv, K_proj)
+    GGML_ASSERT((*k)->ne[1] == 1);  // TODO I think we could handle adding a full prefix sequence
     ggml_tensor* updated_k = ggml_set_2d_inplace(model.ctx, full_k, *k, full_k->nb[2], full_k->nb[1] * step_nr);
     ggml_tensor* updated_v = ggml_set_2d_inplace(model.ctx, full_v, *v, full_v->nb[2], full_v->nb[1] * step_nr);
 
     *k = ggml_slice(model.ctx, updated_k, 1, 0, step_nr + 1);
     *v = ggml_slice(model.ctx, updated_v, 1, 0, step_nr + 1);
+
+    // qk is (B * H, Sq, Sk) == (B*H, 1, Sk) in incremental mode
+    // we return the Sq slice of the (Sq, Sk) attention mask
+    *self_attn_mask = ggml_slice(
+        model.ctx,
+        ggml_slice(model.ctx, kv.self_attn_mask, 0, 0, step_nr + 1),
+        1,
+        step_nr,
+        step_nr + 1
+    );
+
     kv.step_nr = step_nr + 1;
 }
 
+// variant of ggml_get_rows that allows `a` to have more than 2 dims.
+ggml_tensor* ggml_get_rows2(ggml_context* ctx, ggml_tensor* a, ggml_tensor* b) {
+    int flattened = 0;
+    GGML_ASSERT(a->n_dims <= 3);
+    if (a->n_dims == 3) {
+        flattened = a->ne[0];
+        a = ggml_flatten_1d(ctx, a, 0);
+    }
+    a = ggml_get_rows(ctx, a, b);
+    if (flattened) {
+        a = ggml_unflatten_1d(ctx, a, 0, flattened);
+    }
+    return a;
+}
+
+
+void _reorder_kv_cache(ggml_context* ctx, ggml_cgraph* gf, KeyValueTensor& kv, ggml_tensor* new_order) {
+    ggml_detach(kv.full_k);
+    kv.full_k = ggml_get_rows2(ctx, kv.full_k, new_order);
+    ggml_build_forward_expand(gf, kv.full_k);
+
+    ggml_detach(kv.full_v);
+    kv.full_v = ggml_get_rows2(ctx, kv.full_v, new_order);
+    ggml_build_forward_expand(gf, kv.full_v);
+}
+
+
+void reorder_kv_cache(const fairseq2_model& model, ggml_cgraph* gf, ggml_tensor* new_order) {
+    ggml_context* ctx = model.ctx;
+    for (auto& named_kv : model.kv_cache) {
+        _reorder_kv_cache(ctx, gf, named_kv.second, new_order);
+    }
+}
 
 
 inline double model_layer_config_d(const fairseq2_model& model, std::string name) {
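In numpy terms (axes reversed relative to ggml's `ne` order), `append_to_prev_kv` does a row write into the pre-allocated cache, then returns growing views of the cache plus the matching `(Sq=1, Sk=step_nr+1)` slice of the shared causal mask. A sketch under those shape assumptions:

```python
import numpy as np

beam, max_seq_len, model_dim = 2, 8, 4
full_k = np.zeros((beam, max_seq_len, model_dim), dtype=np.float32)
mask = np.triu(np.full((max_seq_len, max_seq_len), -np.inf, dtype=np.float32), k=1)

def append_sketch(step_nr: int, k: np.ndarray):
    full_k[:, step_nr] = k[:, 0]                 # ggml_set_2d_inplace: write this step's row
    k_view = full_k[:, : step_nr + 1]            # ggml_slice: only the rows filled so far
    step_mask = mask[step_nr : step_nr + 1, : step_nr + 1]  # mask row for the current query
    return k_view, step_mask

k_view, step_mask = append_sketch(0, np.ones((beam, 1, model_dim), dtype=np.float32))
assert k_view.shape == (beam, 1, model_dim) and step_mask.shape == (1, 1)
```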
@@ -281,18 +336,18 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
                 ggml_set_name(k, "k");
                 v = Linear_forward(model, prefix + ".v_proj", values);
                 ggml_set_name(v, "v");
-                model.kv_cache[prefix] = KeyValueTensor{k, v, 1};
+                model.kv_cache[prefix] = KeyValueTensor{k, v, nullptr, 1};
             } else {
                 k = kv_cache.full_k;
                 v = kv_cache.full_v;
             }
-        } else {
+        } else { // self attention
             // (1, K) -> (N, 1, K_proj)
             k = Linear_forward(model, prefix + ".k_proj", keys);
             // (1, V) -> (N, 1, V_proj)
             v = Linear_forward(model, prefix + ".v_proj", values);
 
-            append_to_prev_kv(model, prefix, &k, &v);
+            append_to_prev_kv(model, prefix, &k, &v, &attn_mask);
         }
     }
     k = _reshape_num_head(ctx, k, head_dim);  // (B * H, Sk, H_dim)
@@ -315,7 +370,7 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     ggml_set_name(qk, "qk_scaled");
 
     // TODO: Should we replace this by ggml_diag_mask_inf ?
-    if (attn_mask) qk = ggml_add(ctx, qk, attn_mask);
+    if (attn_mask) qk = ggml_add_inplace(ctx, qk, attn_mask);
     // TODO: upgrade qk to float32 if needed
     ggml_tensor* attn_weights = ggml_soft_max(ctx, qk);  // (B * H, S, Sk)
     ggml_set_name(attn_weights, "attn_weights");
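The switch to `ggml_add_inplace` only avoids an extra allocation; the masking semantics are unchanged: adding `-inf` before the softmax drives the corresponding weight to exactly zero. A one-row numpy illustration:

```python
import numpy as np

qk = np.array([0.5, 1.0, 2.0])        # scaled scores for one query position
mask = np.array([0.0, 0.0, -np.inf])  # the last key position is masked
z = qk + mask
attn = np.exp(z - z.max()) / np.exp(z - z.max()).sum()
# attn[2] == 0.0: the masked position gets zero attention weight
```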
@@ -992,7 +1047,7 @@ ggml_tensor* ggml_expand_2d(ggml_context* ctx, ggml_tensor* x, int64_t ne0, int6
     return y;
 }
 
-void _bootstrap_seqs_and_scores(
+extern "C" void _bootstrap_seqs_and_scores(
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
     ggml_tensor* full_seqs,
@@ -1078,14 +1133,6 @@ int topk(
 }
 
 
-void ggml_detach(ggml_tensor* a) {
-    a->op = GGML_OP_NONE;
-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        a->src[i] = nullptr;
-    }
-}
-
-
 /// Copies the sequence and scores of a given candidate beam.
 void _finalize_hypothesis(
     const SequenceGeneratorJob& job,
@@ -1303,14 +1350,14 @@ extern "C" Hypothesis* generate_sequence(
             new_seqs->type = GGML_TYPE_F32;
             new_seqs = ggml_get_rows(ctx, seqs, beam_indices);
             new_scores = ggml_get_rows(ctx, scores, beam_indices);
-            gf = ggml_build_forward(new_seqs);
-            ggml_graph_compute_with_ctx(ctx, &gf, 1);
-            ggml_detach(new_seqs);
-            new_seqs->type = GGML_TYPE_I32;
+            ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
+            ggml_build_forward_expand(&gf_reorder, new_scores);
+            reorder_kv_cache(model, &gf_reorder, beam_indices);
 
-            gf = ggml_build_forward(new_scores);
-            ggml_graph_compute_with_ctx(ctx, &gf, 1);
+            ggml_graph_compute_with_ctx(ctx, &gf_reorder, 1);
+            ggml_detach(new_seqs);
             ggml_detach(new_scores);
+            new_seqs->type = GGML_TYPE_I32;
         }
         
         // new_seqs[:, step_nr + 1] = next_tokens
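The cache reorder that `reorder_kv_cache` adds to the beam-search step is a plain row gather over the beam dimension; `ggml_get_rows2` exists only because, per its comment above, plain `ggml_get_rows` expects a 2-dim `a`, so the inner dims are flattened first. A numpy sketch with cache shape `(beam, max_seq_len, model_dim)`:

```python
import numpy as np

beam, max_seq_len, model_dim = 3, 4, 2
full_k = np.arange(beam * max_seq_len * model_dim, dtype=np.float32)
full_k = full_k.reshape(beam, max_seq_len, model_dim)
beam_indices = np.array([1, 1, 2])  # surviving beams continue from old beams 1, 1, 2

# equivalent of ggml_flatten_1d + ggml_get_rows + ggml_unflatten_1d
flat = full_k.reshape(beam, max_seq_len * model_dim)
reordered = flat[beam_indices].reshape(beam, max_seq_len, model_dim)
assert (reordered == full_k[beam_indices]).all()  # same as a direct gather
```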

ggml/examples/unity/fairseq2.h (+1 -1)

@@ -9,8 +9,8 @@
 struct KeyValueTensor {
     ggml_tensor* full_k;
     ggml_tensor* full_v;
+    ggml_tensor* self_attn_mask;
     int step_nr;
-    // ggml_tensor* key_padding_mask;
 };
 
 struct fairseq2_model {

ggml/ggml.py (+4 -0)

@@ -453,4 +453,8 @@ def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
 
 @c_fn(lib)
 def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
+    return -1
+
+@c_fn(lib)
+def fairseq2_kv_cache_alloc(model: ctypes.c_void_p, beam_size: int, max_seq_len: int) -> None:
+    pass
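`@c_fn(lib)` presumably binds the decorated signature to the C symbol of the same name via `ctypes_utils`, so the Python bodies (`return -1`, `pass`) are stubs that never run. A minimal usage sketch matching the new test:

```python
import ctypes
import ggml

def alloc_decoder_cache(g_model: ctypes.c_void_p) -> None:
    # pre-allocate the decoder KV caches (and the shared causal mask)
    # for beam_size=2, max_seq_len=21, as the new test below does
    ggml.fairseq2_kv_cache_alloc(g_model, 2, 21)
```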

ggml/test_unity_cpp.py (+88 -34)

@@ -22,6 +22,7 @@ from seamless_communication.models.inference.translator import Translator, Modal
 from fairseq2.data.audio import WaveformToFbankConverter
 import torchaudio
 from fairseq2.models.wav2vec2.feature_extractor import Wav2Vec2FbankFeatureExtractor
+
 Ctx = ggml.ggml_context_p
 
 UNITY_MODELS = Path(__file__).parent / "examples/unity/models"
@@ -241,6 +242,42 @@ def test_MultiheadAttention_forward(
     assert np.allclose(y_exp, y, atol=1e-2 if naive_attn else 1e-4)
 
 
+def test_MultiheadAttention_forward_with_state_bag(ctx: Ctx, g_model: c_void_p) -> None:
+    pt_model = load_pt_model()
+    self_attn = pt_model.text_encoder.layers[0].self_attn
+
+    x = torch.empty((2, 21, 1024))
+    torch.random.manual_seed(0)
+    torch.nn.init.uniform_(x, -1, 1)
+
+    state_bag = fairseq2.nn.IncrementalStateBag()
+
+    ggml.fairseq2_kv_cache_alloc(g_model, 2, 21)
+    # Incremental decoding
+    for t in range(3):
+        xq, xk = x[:, t : t + 1], x[:, t : t + 1]
+        y_exp = self_attn(xq, None, xk, xk, state_bag=state_bag).numpy()
+        assert y_exp.shape == (2, 1, 1024)
+
+        gxq = ggml.from_numpy(ctx, xq.contiguous())
+        gxk = ggml.from_numpy(ctx, xk.contiguous())
+        ggml.ggml_set_name(gxk, b"xk")
+        gy = ggml.forward(
+            "MultiheadAttention",
+            g_model,
+            "text_encoder.layers.0.self_attn",
+            gxq,
+            gxk,
+            gxk,
+            None,  # type: ignore
+        )
+        gf = ggml.ggml_build_forward(gy)
+        ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+
+        y = ggml.to_numpy(gy)
+        assert np.allclose(y, y_exp, atol=1e-2)
+
+
 def test_StandardTransformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> None:
     x = torch.empty((2, 21, 1024))
     padding_mask = torch.ones((2, 21))
@@ -272,11 +309,12 @@ def test_StandardTransformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) ->
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
 
-def test_StandardConformerEncoderLayer_forward(
-    ctx: Ctx, g_model: c_void_p
-) -> None:
+
+def test_StandardConformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> None:
     pt_model = load_pt_model()
-    x = torch.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_conformer_block.pt")
+    x = torch.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_conformer_block.pt"
+    )
     padding_mask = torch.ones((1, x.shape[1]))
     layer = pt_model.speech_encoder.inner.layers[0]
     gx = ggml.from_numpy(ctx, x[0])
@@ -304,7 +342,9 @@ def test_StandardConformerEncoderAdaptorLayer_forward(
     ctx: Ctx, g_model: c_void_p
 ) -> None:
     pt_model = load_pt_model()
-    x = torch.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_adaptor.pt")
+    x = torch.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_adaptor.pt"
+    )
     layer = pt_model.speech_encoder.adaptor_layers[0]
     gx = ggml.from_numpy(ctx, x[0])
     ggml.ggml_set_name(gx, b"x")
@@ -356,12 +396,13 @@ def test_StandardTransformerEncoder_forward(ctx: Ctx, g_model: c_void_p) -> None
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4)
 
-def test_StandardConformerEncoder_forward(
-    ctx: Ctx, g_model: c_void_p
-) -> None:
+
+def test_StandardConformerEncoder_forward(ctx: Ctx, g_model: c_void_p) -> None:
     pt_model = load_pt_model()
-    wav, _ = torchaudio.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav")
-    gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml!
+    wav, _ = torchaudio.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav"
+    )
+    gx = ggml.from_numpy(ctx, wav * 2**15)  # Apply scale before sending into ggml!
     ggml.ggml_set_name(gx, b"x")
     gy = ggml.forward(
         "StandardConformerEncoder",
@@ -381,24 +422,25 @@ def test_StandardConformerEncoder_forward(
     )
     converter_input = {
         "waveform": wav.transpose(0, 1),
-        "sample_rate": 16000.,
+        "sample_rate": 16000.0,
         "format": -1,
     }
 
     y = ggml.to_numpy(gy)
-    speech_encoder_input = pt_model.speech_encoder_frontend(converter(converter_input)["fbank"].unsqueeze(0), None)[0]
+    speech_encoder_input = pt_model.speech_encoder_frontend(
+        converter(converter_input)["fbank"].unsqueeze(0), None
+    )[0]
 
     y_exp, _ = pt_model.speech_encoder(speech_encoder_input, None)
     y_exp = y_exp.numpy()  # remove batch dimension
 
     assert y.shape == y_exp.shape
-    assert np.allclose(y_exp, y, atol=1e-2) # There are 10 elements in a 137*1024 tensor with error >1e-2
-
+    assert np.allclose(
+        y_exp, y, atol=1e-2
+    )  # There are 10 elements in a 137*1024 tensor with error >1e-2
 
 
-def test_WaveformToFbank_forward(
-    ctx: Ctx, g_model: c_void_p
-) -> None:
+def test_WaveformToFbank_forward(ctx: Ctx, g_model: c_void_p) -> None:
     pt_model = load_pt_model()
     converter = WaveformToFbankConverter(
         num_mel_bins=80,
@@ -407,30 +449,27 @@ def test_WaveformToFbank_forward(
         standardize=True,
     )
     extractor = Wav2Vec2FbankFeatureExtractor(80, 2, 1)
-    wav, _ = torchaudio.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav")
-    gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml!
+    wav, _ = torchaudio.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav"
+    )
+    gx = ggml.from_numpy(ctx, wav * 2**15)  # Apply scale before sending into ggml!
     ggml.ggml_set_name(gx, b"x")
 
-    gy = ggml.forward(
-        "WaveformToFbank",
-        g_model,
-        "",
-        gx
-    )
+    gy = ggml.forward("WaveformToFbank", g_model, "", gx)
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
 
     y = ggml.to_numpy(gy)
     converter_input = {
         "waveform": wav.transpose(0, 1),
-        "sample_rate": 16000.,
+        "sample_rate": 16000.0,
         "format": -1,
     }
     y_exp = extractor(converter(converter_input)["fbank"].unsqueeze(0), None)[0]
     y_exp = y_exp.numpy()
 
     assert y.shape == y_exp.shape
-    assert np.allclose(y_exp, y, atol=4e-3) # reduce? error is from standardization
+    assert np.allclose(y_exp, y, atol=4e-3)  # reduce? error is from standardization
 
 
 def test_causal_attention_mask(ctx: Ctx):
@@ -600,8 +639,11 @@ def test_t2tt(ctx: Ctx, g_model: c_void_p):
         # The score error is big, this may negatively impact the beam search.
         assert np.allclose(g_step_scores, exp["step_scores"], atol=0.1)
 
+
 def test_s2tt(ctx: Ctx, g_model: c_void_p):
-    src_audio_wav, _ = torchaudio.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav")
+    src_audio_wav, _ = torchaudio.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav"
+    )
     # translator = load_translator()
     # token_encoder = translator.text_tokenizer.create_encoder(
     #     task="translation"
@@ -628,9 +670,23 @@ def test_s2tt(ctx: Ctx, g_model: c_void_p):
     # tgt_tokens = text_out.generator_output.results[0][0].seq
     # score = text_out.generator_output.results[0][0].score.item()
 
-    tgt_tokens = [     3, 256200,  16991, 249346, 249725,    146,  25220, 251069, 249211,
-        251148, 253935,      3] # "大家好 , 世界无主题。"
-    gx = ggml.from_numpy(ctx, src_audio_wav * 2**15) # Apply scale before sending into ggml!
+    tgt_tokens = [
+        3,
+        256200,
+        16991,
+        249346,
+        249725,
+        146,
+        25220,
+        251069,
+        249211,
+        251148,
+        253935,
+        3,
+    ]  # "大家好 , 世界无主题。"
+    gx = ggml.from_numpy(
+        ctx, src_audio_wav * 2**15
+    )  # Apply scale before sending into ggml!
     ggml.ggml_set_name(gx, b"x")
     gy = ggml.forward(
         "StandardConformerEncoder",
@@ -659,8 +715,6 @@ def test_s2tt(ctx: Ctx, g_model: c_void_p):
     job.bos_idx = 2
     job.eos_idx = 3
 
-    result_ptr = ggml.generate_sequence(
-        g_model, job, encoder_out, None, ctx
-    )
+    result_ptr = ggml.generate_sequence(g_model, job, encoder_out, None, ctx)
     g_tokens = list(ggml.to_numpy(result_ptr[0].seq))
     assert g_tokens == tgt_tokens