
WIP: simple failing test case

Guillaume Wenzek, 1 year ago
parent
commit 522b97234e
5 changed files with 165 additions and 60 deletions
  1. ggml/ctypes_utils.py (+4 -4)
  2. ggml/examples/unity/fairseq2.cpp (+68 -21)
  3. ggml/examples/unity/fairseq2.h (+1 -1)
  4. ggml/ggml.py (+4 -0)
  5. ggml/test_unity_cpp.py (+88 -34)

ggml/ctypes_utils.py (+4 -4)

@@ -32,9 +32,11 @@ def _py_type_to_ctype(t: type):
         raise ValueError(
             f"Type parsing of '{t}' isn't supported, you need to provide a real type annotation."
         )
-    if t.__module__ == "ctypes":
-        return t
+    if t is None:
+        return None
     if isinstance(t, type):
+        if t.__module__ == "ctypes":
+            return t
         if issubclass(t, ctypes.Structure):
             return t
         if issubclass(t, ctypes._Pointer):
@@ -47,8 +49,6 @@ def _py_type_to_ctype(t: type):
         return ctypes.c_bool
     if t is str:
         return ctypes.c_char_p
-    if t is None:
-        return None
 
     if getattr(t, "__origin__", None) is Ptr:
         pointee = _py_type_to_ctype(t.__args__[0])

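The reordering above matters: `None` has no `__module__` attribute, so the old `t.__module__ == "ctypes"` check raised `AttributeError` for `None` return annotations before the `t is None` branch at the bottom could ever run, and non-class annotations should not reach the `issubclass` checks. A minimal sketch of the fixed dispatch order (a hypothetical simplification; the `Ptr` generic and pointer handling are elided):

```python
import ctypes

def _py_type_to_ctype_sketch(t):
    # must run first: None.__module__ raises AttributeError
    if t is None:
        return None
    # only real classes reach the class-based checks
    if isinstance(t, type):
        if t.__module__ == "ctypes":
            return t
        if issubclass(t, ctypes.Structure):
            return t
    if t is int:
        return ctypes.c_int
    if t is float:
        return ctypes.c_float
    if t is bool:
        return ctypes.c_bool
    if t is str:
        return ctypes.c_char_p
    raise ValueError(f"Type parsing of '{t}' isn't supported.")
```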
ggml/examples/unity/fairseq2.cpp (+68 -21)

@@ -8,6 +8,11 @@
 #include <iostream>
 #include <fnmatch.h>
 
+void ggml_detach(ggml_tensor* a) {
+    a->op = GGML_OP_NONE;
+    std::fill(a->src, a->src + GGML_MAX_SRC, nullptr);
+}
+
 /// allocate the fairseq2 model and hyperparameters
 extern "C" fairseq2_model* fairseq2_model_alloc() {
     // pre-allocate some memory to write hyperparameters and tensors pointers
@@ -16,11 +21,14 @@ extern "C" fairseq2_model* fairseq2_model_alloc() {
     return model;
 }
 
-void fairseq2_kv_cache_alloc(const fairseq2_model& model, std::size_t beam_size, std::size_t max_seq_len) {
+extern "C" void fairseq2_kv_cache_alloc(const fairseq2_model& model, int beam_size, int max_seq_len) {
     // Note: we only allocate the cache for the decoder attention.
     // For encoder attention since we compute it all at once,
     // the allocation is delayed to the first forward pass, to not over allocate.
     auto layer_glob_c = "*decoder.*attn.k_proj.weight";
+    ggml_tensor* self_attn_mask = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, max_seq_len, max_seq_len);
+    self_attn_mask = ggml_diag_mask_inf(model.ctx, self_attn_mask, 0);
+
     for (auto named_tensor : model.tensors) {
         const std::string& name = named_tensor.first;
         if (::fnmatch(layer_glob_c, name.c_str(), 0) == FNM_NOMATCH)
@@ -31,6 +39,7 @@ void fairseq2_kv_cache_alloc(const fairseq2_model& model, std::size_t beam_size,
         model.kv_cache[name.substr(0, name.size() - 14)] = KeyValueTensor {
             ggml_new_tensor_3d(model.ctx, k_proj->type, model_dim, max_seq_len, beam_size),
             ggml_new_tensor_3d(model.ctx, k_proj->type, model_dim, max_seq_len, beam_size),
+            self_attn_mask,
             0,
         };
     }
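The cache now carries one shared `(max_seq_len, max_seq_len)` causal mask, built once per allocation. A numpy sketch of what `ggml_diag_mask_inf` produces here, assuming the freshly allocated buffer reads as zeros (the op only writes the above-diagonal entries and leaves the rest of the buffer untouched):

```python
import numpy as np

max_seq_len = 4
# -inf strictly above the diagonal (future positions), untouched (assumed 0) elsewhere
mask = np.triu(np.full((max_seq_len, max_seq_len), -np.inf), k=1)
# [[  0. -inf -inf -inf]
#  [  0.   0. -inf -inf]
#  [  0.   0.   0. -inf]
#  [  0.   0.   0.   0.]]
```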
@@ -43,21 +52,67 @@ bool has_kv_cache(const fairseq2_model& model) {
 // copy k and v to kv cache
 // kv.full_k[step_nr] = k;
 // kv.full_v[step_nr] = v;
-void append_to_prev_kv(const fairseq2_model& model, const std::string& prefix, ggml_tensor** k, ggml_tensor** v) {
+void append_to_prev_kv(const fairseq2_model& model, const std::string& prefix, ggml_tensor** k, ggml_tensor** v, ggml_tensor** self_attn_mask) {
     KeyValueTensor& kv = model.kv_cache[prefix];
     int step_nr = kv.step_nr;
 
     ggml_tensor* full_k = kv.full_k;
     ggml_tensor* full_v = kv.full_v;
 
+    // (N, S_kv, K_proj)
+    GGML_ASSERT((*k)->ne[1] == 1);  // TODO I think we could handle adding a full prefix sequence
     ggml_tensor* updated_k = ggml_set_2d_inplace(model.ctx, full_k, *k, full_k->nb[2], full_k->nb[1] * step_nr);
     ggml_tensor* updated_v = ggml_set_2d_inplace(model.ctx, full_v, *v, full_v->nb[2], full_v->nb[1] * step_nr);
 
     *k = ggml_slice(model.ctx, updated_k, 1, 0, step_nr + 1);
     *v = ggml_slice(model.ctx, updated_v, 1, 0, step_nr + 1);
+
+    // qk is (B * H, Sq, Sk) == (B*H, 1, Sk) in incremental mode
+    // we return the Sq slice of the (Sq, Sk) attention mask
+    *self_attn_mask = ggml_slice(
+        model.ctx,
+        ggml_slice(model.ctx, kv.self_attn_mask, 0, 0, step_nr + 1),
+        1,
+        step_nr,
+        step_nr + 1
+    );
+
     kv.step_nr = step_nr + 1;
 }
 
+// variant of ggml_get_rows that allows `a` to have more than 2 dims.
+ggml_tensor* ggml_get_rows2(ggml_context* ctx, ggml_tensor* a, ggml_tensor* b) {
+    int flattened = 0;
+    GGML_ASSERT(a->n_dims <= 3);
+    if (a->n_dims == 3) {
+        flattened = a->ne[0];
+        a = ggml_flatten_1d(ctx, a, 0);
+    }
+    a = ggml_get_rows(ctx, a, b);
+    if (flattened) {
+        a = ggml_unflatten_1d(ctx, a, 0, flattened);
+    }
+    return a;
+}
+
+
+void _reorder_kv_cache(ggml_context* ctx, ggml_cgraph* gf, KeyValueTensor& kv, ggml_tensor* new_order) {
+    ggml_detach(kv.full_k);
+    kv.full_k = ggml_get_rows2(ctx, kv.full_k, new_order);
+    ggml_build_forward_expand(gf, kv.full_k);
+
+    ggml_detach(kv.full_v);
+    kv.full_v = ggml_get_rows2(ctx, kv.full_v, new_order);
+    ggml_build_forward_expand(gf, kv.full_v);
+}
+
+
+void reorder_kv_cache(const fairseq2_model& model, ggml_cgraph* gf, ggml_tensor* new_order) {
+    ggml_context* ctx = model.ctx;
+    for (auto& named_kv : model.kv_cache) {
+        _reorder_kv_cache(ctx, gf, named_kv.second, new_order);
+    }
+}
 
 
 inline double model_layer_config_d(const fairseq2_model& model, std::string name) {
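In numpy terms (axes reversed relative to ggml's `ne` order), `append_to_prev_kv` does a row write into the pre-allocated cache, then returns growing views of the cache plus the matching `(Sq=1, Sk=step_nr+1)` slice of the shared causal mask. A sketch under those shape assumptions:

```python
import numpy as np

beam, max_seq_len, model_dim = 2, 8, 4
full_k = np.zeros((beam, max_seq_len, model_dim), dtype=np.float32)
mask = np.triu(np.full((max_seq_len, max_seq_len), -np.inf, dtype=np.float32), k=1)

def append_sketch(step_nr: int, k: np.ndarray):
    full_k[:, step_nr] = k[:, 0]                 # ggml_set_2d_inplace: write this step's row
    k_view = full_k[:, : step_nr + 1]            # ggml_slice: only the rows filled so far
    step_mask = mask[step_nr : step_nr + 1, : step_nr + 1]  # mask row for the current query
    return k_view, step_mask

k_view, step_mask = append_sketch(0, np.ones((beam, 1, model_dim), dtype=np.float32))
assert k_view.shape == (beam, 1, model_dim) and step_mask.shape == (1, 1)
```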
@@ -281,18 +336,18 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
                 ggml_set_name(k, "k");
                 v = Linear_forward(model, prefix + ".v_proj", values);
                 ggml_set_name(v, "v");
-                model.kv_cache[prefix] = KeyValueTensor{k, v, 1};
+                model.kv_cache[prefix] = KeyValueTensor{k, v, nullptr, 1};
             } else {
                 k = kv_cache.full_k;
                 v = kv_cache.full_v;
             }
-        } else {
+        } else { // self attention
             // (1, K) -> (N, 1, K_proj)
             k = Linear_forward(model, prefix + ".k_proj", keys);
             // (1, V) -> (N, 1, V_proj)
             v = Linear_forward(model, prefix + ".v_proj", values);
 
-            append_to_prev_kv(model, prefix, &k, &v);
+            append_to_prev_kv(model, prefix, &k, &v, &attn_mask);
         }
     }
     k = _reshape_num_head(ctx, k, head_dim);  // (B * H, Sk, H_dim)
@@ -315,7 +370,7 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     ggml_set_name(qk, "qk_scaled");
 
     // TODO: Should we replace this by ggml_diag_mask_inf ?
-    if (attn_mask) qk = ggml_add(ctx, qk, attn_mask);
+    if (attn_mask) qk = ggml_add_inplace(ctx, qk, attn_mask);
     // TODO: upgrade qk to float32 if needed
     ggml_tensor* attn_weights = ggml_soft_max(ctx, qk);  // (B * H, S, Sk)
     ggml_set_name(attn_weights, "attn_weights");
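The switch to `ggml_add_inplace` only avoids an extra allocation; the masking semantics are unchanged: adding `-inf` before the softmax drives the corresponding weight to exactly zero. A one-row numpy illustration:

```python
import numpy as np

qk = np.array([0.5, 1.0, 2.0])        # scaled scores for one query position
mask = np.array([0.0, 0.0, -np.inf])  # the last key position is masked
z = qk + mask
attn = np.exp(z - z.max()) / np.exp(z - z.max()).sum()
# attn[2] == 0.0: the masked position gets zero attention weight
```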
@@ -992,7 +1047,7 @@ ggml_tensor* ggml_expand_2d(ggml_context* ctx, ggml_tensor* x, int64_t ne0, int6
     return y;
 }
 
-void _bootstrap_seqs_and_scores(
+extern "C" void _bootstrap_seqs_and_scores(
     fairseq2_model& model,
     const SequenceGeneratorJob& job,
     ggml_tensor* full_seqs,
@@ -1078,14 +1133,6 @@ int topk(
 }
 
 
-void ggml_detach(ggml_tensor* a) {
-    a->op = GGML_OP_NONE;
-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        a->src[i] = nullptr;
-    }
-}
-
-
 /// Copies the sequence and scores of a given candidate beam.
 void _finalize_hypothesis(
     const SequenceGeneratorJob& job,
@@ -1303,14 +1350,14 @@ extern "C" Hypothesis* generate_sequence(
             new_seqs->type = GGML_TYPE_F32;
             new_seqs = ggml_get_rows(ctx, seqs, beam_indices);
             new_scores = ggml_get_rows(ctx, scores, beam_indices);
-            gf = ggml_build_forward(new_seqs);
-            ggml_graph_compute_with_ctx(ctx, &gf, 1);
-            ggml_detach(new_seqs);
-            new_seqs->type = GGML_TYPE_I32;
+            ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
+            ggml_build_forward_expand(&gf_reorder, new_scores);
+            reorder_kv_cache(model, &gf_reorder, beam_indices);
 
-            gf = ggml_build_forward(new_scores);
-            ggml_graph_compute_with_ctx(ctx, &gf, 1);
+            ggml_graph_compute_with_ctx(ctx, &gf_reorder, 1);
+            ggml_detach(new_seqs);
             ggml_detach(new_scores);
+            new_seqs->type = GGML_TYPE_I32;
         }
         
         // new_seqs[:, step_nr + 1] = next_tokens
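The cache reorder that `reorder_kv_cache` adds to the beam-search step is a plain row gather over the beam dimension; `ggml_get_rows2` exists only because, per its comment above, plain `ggml_get_rows` expects a 2-dim `a`, so the inner dims are flattened first. A numpy sketch with cache shape `(beam, max_seq_len, model_dim)`:

```python
import numpy as np

beam, max_seq_len, model_dim = 3, 4, 2
full_k = np.arange(beam * max_seq_len * model_dim, dtype=np.float32)
full_k = full_k.reshape(beam, max_seq_len, model_dim)
beam_indices = np.array([1, 1, 2])  # surviving beams continue from old beams 1, 1, 2

# equivalent of ggml_flatten_1d + ggml_get_rows + ggml_unflatten_1d
flat = full_k.reshape(beam, max_seq_len * model_dim)
reordered = flat[beam_indices].reshape(beam, max_seq_len, model_dim)
assert (reordered == full_k[beam_indices]).all()  # same as a direct gather
```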

ggml/examples/unity/fairseq2.h (+1 -1)

@@ -9,8 +9,8 @@
 struct KeyValueTensor {
     ggml_tensor* full_k;
     ggml_tensor* full_v;
+    ggml_tensor* self_attn_mask;
     int step_nr;
-    // ggml_tensor* key_padding_mask;
 };
 
 struct fairseq2_model {

ggml/ggml.py (+4 -0)

@@ -453,4 +453,8 @@ def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]:
 
 @c_fn(lib)
 def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: str) -> int:
+    return -1
+
+@c_fn(lib)
+def fairseq2_kv_cache_alloc(model: ctypes.c_void_p, beam_size: int, max_seq_len: int) -> None:
+    pass
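`@c_fn(lib)` presumably binds the decorated signature to the C symbol of the same name via `ctypes_utils`, so the Python bodies (`return -1`, `pass`) are stubs that never run. A minimal usage sketch matching the new test:

```python
import ctypes
import ggml

def alloc_decoder_cache(g_model: ctypes.c_void_p) -> None:
    # pre-allocate the decoder KV caches (and the shared causal mask)
    # for beam_size=2, max_seq_len=21, as the new test below does
    ggml.fairseq2_kv_cache_alloc(g_model, 2, 21)
```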

ggml/test_unity_cpp.py (+88 -34)

@@ -22,6 +22,7 @@ from seamless_communication.models.inference.translator import Translator, Modal
 from fairseq2.data.audio import WaveformToFbankConverter
 import torchaudio
 from fairseq2.models.wav2vec2.feature_extractor import Wav2Vec2FbankFeatureExtractor
+
 Ctx = ggml.ggml_context_p
 
 UNITY_MODELS = Path(__file__).parent / "examples/unity/models"
@@ -241,6 +242,42 @@ def test_MultiheadAttention_forward(
     assert np.allclose(y_exp, y, atol=1e-2 if naive_attn else 1e-4)
 
 
+def test_MultiheadAttention_forward_with_state_bag(ctx: Ctx, g_model: c_void_p) -> None:
+    pt_model = load_pt_model()
+    self_attn = pt_model.text_encoder.layers[0].self_attn
+
+    x = torch.empty((2, 21, 1024))
+    torch.random.manual_seed(0)
+    torch.nn.init.uniform_(x, -1, 1)
+
+    state_bag = fairseq2.nn.IncrementalStateBag()
+
+    ggml.fairseq2_kv_cache_alloc(g_model, 2, 21)
+    # Incremental decoding
+    for t in range(3):
+        xq, xk = x[:, t : t + 1], x[:, t : t + 1]
+        y_exp = self_attn(xq, None, xk, xk, state_bag=state_bag).numpy()
+        assert y_exp.shape == (2, 1, 1024)
+
+        gxq = ggml.from_numpy(ctx, xq.contiguous())
+        gxk = ggml.from_numpy(ctx, xk.contiguous())
+        ggml.ggml_set_name(gxk, b"xk")
+        gy = ggml.forward(
+            "MultiheadAttention",
+            g_model,
+            "text_encoder.layers.0.self_attn",
+            gxq,
+            gxk,
+            gxk,
+            None,  # type: ignore
+        )
+        gf = ggml.ggml_build_forward(gy)
+        ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+
+        y = ggml.to_numpy(gy)
+        assert np.allclose(y, y_exp, atol=1e-2)
+
+
 def test_StandardTransformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> None:
     x = torch.empty((2, 21, 1024))
     padding_mask = torch.ones((2, 21))
@@ -272,11 +309,12 @@ def test_StandardTransformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) ->
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
 
-def test_StandardConformerEncoderLayer_forward(
-    ctx: Ctx, g_model: c_void_p
-) -> None:
+
+def test_StandardConformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> None:
     pt_model = load_pt_model()
-    x = torch.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_conformer_block.pt")
+    x = torch.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_conformer_block.pt"
+    )
     padding_mask = torch.ones((1, x.shape[1]))
     layer = pt_model.speech_encoder.inner.layers[0]
     gx = ggml.from_numpy(ctx, x[0])
@@ -304,7 +342,9 @@ def test_StandardConformerEncoderAdaptorLayer_forward(
     ctx: Ctx, g_model: c_void_p
 ) -> None:
     pt_model = load_pt_model()
-    x = torch.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_adaptor.pt")
+    x = torch.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_adaptor.pt"
+    )
     layer = pt_model.speech_encoder.adaptor_layers[0]
     gx = ggml.from_numpy(ctx, x[0])
     ggml.ggml_set_name(gx, b"x")
@@ -356,12 +396,13 @@ def test_StandardTransformerEncoder_forward(ctx: Ctx, g_model: c_void_p) -> None
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4)
 
-def test_StandardConformerEncoder_forward(
-    ctx: Ctx, g_model: c_void_p
-) -> None:
+
+def test_StandardConformerEncoder_forward(ctx: Ctx, g_model: c_void_p) -> None:
     pt_model = load_pt_model()
-    wav, _ = torchaudio.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav")
-    gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml!
+    wav, _ = torchaudio.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav"
+    )
+    gx = ggml.from_numpy(ctx, wav * 2**15)  # Apply scale before sending into ggml!
     ggml.ggml_set_name(gx, b"x")
     gy = ggml.forward(
         "StandardConformerEncoder",
@@ -381,24 +422,25 @@ def test_StandardConformerEncoder_forward(
     )
     converter_input = {
         "waveform": wav.transpose(0, 1),
-        "sample_rate": 16000.,
+        "sample_rate": 16000.0,
         "format": -1,
     }
 
     y = ggml.to_numpy(gy)
-    speech_encoder_input = pt_model.speech_encoder_frontend(converter(converter_input)["fbank"].unsqueeze(0), None)[0]
+    speech_encoder_input = pt_model.speech_encoder_frontend(
+        converter(converter_input)["fbank"].unsqueeze(0), None
+    )[0]
 
     y_exp, _ = pt_model.speech_encoder(speech_encoder_input, None)
     y_exp = y_exp.numpy()  # remove batch dimension
 
     assert y.shape == y_exp.shape
-    assert np.allclose(y_exp, y, atol=1e-2) # There are 10 elements in a 137*1024 tensor with error >1e-2
-
+    assert np.allclose(
+        y_exp, y, atol=1e-2
+    )  # There are 10 elements in a 137*1024 tensor with error >1e-2
 
 
-def test_WaveformToFbank_forward(
-    ctx: Ctx, g_model: c_void_p
-) -> None:
+def test_WaveformToFbank_forward(ctx: Ctx, g_model: c_void_p) -> None:
     pt_model = load_pt_model()
     converter = WaveformToFbankConverter(
         num_mel_bins=80,
@@ -407,30 +449,27 @@ def test_WaveformToFbank_forward(
         standardize=True,
     )
     extractor = Wav2Vec2FbankFeatureExtractor(80, 2, 1)
-    wav, _ = torchaudio.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav")
-    gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml!
+    wav, _ = torchaudio.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav"
+    )
+    gx = ggml.from_numpy(ctx, wav * 2**15)  # Apply scale before sending into ggml!
     ggml.ggml_set_name(gx, b"x")
 
-    gy = ggml.forward(
-        "WaveformToFbank",
-        g_model,
-        "",
-        gx
-    )
+    gy = ggml.forward("WaveformToFbank", g_model, "", gx)
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
 
     y = ggml.to_numpy(gy)
     converter_input = {
         "waveform": wav.transpose(0, 1),
-        "sample_rate": 16000.,
+        "sample_rate": 16000.0,
         "format": -1,
     }
     y_exp = extractor(converter(converter_input)["fbank"].unsqueeze(0), None)[0]
     y_exp = y_exp.numpy()
 
     assert y.shape == y_exp.shape
-    assert np.allclose(y_exp, y, atol=4e-3) # reduce? error is from standardization
+    assert np.allclose(y_exp, y, atol=4e-3)  # reduce? error is from standardization
 
 
 def test_causal_attention_mask(ctx: Ctx):
@@ -600,8 +639,11 @@ def test_t2tt(ctx: Ctx, g_model: c_void_p):
         # The score error is big, this may negatively impact the beam search.
         assert np.allclose(g_step_scores, exp["step_scores"], atol=0.1)
 
+
 def test_s2tt(ctx: Ctx, g_model: c_void_p):
-    src_audio_wav, _ = torchaudio.load("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav")
+    src_audio_wav, _ = torchaudio.load(
+        "/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/test.wav"
+    )
     # translator = load_translator()
     # token_encoder = translator.text_tokenizer.create_encoder(
     #     task="translation"
@@ -628,9 +670,23 @@ def test_s2tt(ctx: Ctx, g_model: c_void_p):
     # tgt_tokens = text_out.generator_output.results[0][0].seq
     # score = text_out.generator_output.results[0][0].score.item()
 
-    tgt_tokens = [     3, 256200,  16991, 249346, 249725,    146,  25220, 251069, 249211,
-        251148, 253935,      3] # "大家好 , 世界无主题。"
-    gx = ggml.from_numpy(ctx, src_audio_wav * 2**15) # Apply scale before sending into ggml!
+    tgt_tokens = [
+        3,
+        256200,
+        16991,
+        249346,
+        249725,
+        146,
+        25220,
+        251069,
+        249211,
+        251148,
+        253935,
+        3,
+    ]  # "大家好 , 世界无主题。"
+    gx = ggml.from_numpy(
+        ctx, src_audio_wav * 2**15
+    )  # Apply scale before sending into ggml!
     ggml.ggml_set_name(gx, b"x")
     gy = ggml.forward(
         "StandardConformerEncoder",
@@ -659,8 +715,6 @@ def test_s2tt(ctx: Ctx, g_model: c_void_p):
     job.bos_idx = 2
     job.eos_idx = 3
 
-    result_ptr = ggml.generate_sequence(
-        g_model, job, encoder_out, None, ctx
-    )
+    result_ptr = ggml.generate_sequence(g_model, job, encoder_out, None, ctx)
     g_tokens = list(ggml.to_numpy(result_ptr[0].seq))
     assert g_tokens == tgt_tokens