Guillaume Wenzek 1 year ago
parent
commit
fa85f05545

+ 52 - 21
ggml/examples/unity/fairseq2.cpp

@@ -7,15 +7,21 @@ extern "C" fairseq2_model* fairseq2_model_alloc() {
     auto* model = new fairseq2_model;
     model->hparams = new std::uint8_t[8 * 1024];
     model->arch = new std::uint64_t[16 * 1024];  // max tensors allowed
+    model->tensors_ctx = nullptr;
     return model;
 };
 
 extern "C" void fairseq2_model_free(fairseq2_model* model) {
+    if (model->tensors_ctx) ggml_free(model->tensors_ctx);
     delete (std::uint64_t*)(model->arch);
     delete (std::uint8_t*)model->hparams;
     delete model;
 };
 
+extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx) {
+    model->ctx = ctx;
+}
+
 extern "C" std::string* std_string_alloc(char* c_str) {
     return new std::string(c_str);
 }
@@ -163,43 +169,68 @@ void MultiheadAttention_init(
 }
 
 ggml_tensor* reshape_num_head(ggml_context* ctx, ggml_tensor* x, int num_heads) {
-    int slen = x->ne[0];
-    // (S, M) -> (S, K_proj)
-    x = ggml_reshape_3d(ctx, x, slen, num_heads, x->ne[1] / num_heads);
-    // (S, K_proj) -> (H, S, K_h)
-    return ggml_transpose(ctx, x);
+    int slen = x->ne[1];
+    int model_dim = x->ne[0];
+    // (S, dim) -> (S, H, H_dim)
+    x = ggml_reshape_3d(ctx, x, model_dim / num_heads, num_heads, slen);
+    // (S, H, H_dim) -> (H, S, H_dim)
+    x = ggml_permute(ctx, x, 0, 2, 1, 3);
+    return x;
 }
 
 
 
-extern "C" ggml_tensor* // (d_in, seq_len)
+extern "C" ggml_tensor* // (slen, d_in)
 MultiheadAttention_forward(
     fairseq2_model& model,
     const std::string &prefix,
-    ggml_tensor* queries,  // (d_in, len_q)
-    ggml_tensor* keys,  // (d_in, len_k)
-    ggml_tensor* values,  // (d_out, len_k)
-    ggml_tensor* mask // (seq_len, len_q)
+    ggml_tensor* queries,  // (slen, d_in)
+    ggml_tensor* keys,  // (klen, d_in)
+    ggml_tensor* values,  // (klen, d_out)
+    ggml_tensor* _ // (klen, slen)  TODO: do we need to pass the mask here?
 ) {
+    int slen = queries->ne[1];
     int num_heads = 16;
+    int head_dim = queries->ne[0] / num_heads;
     ggml_context* ctx = model.ctx;
     ggml_tensor* q = Linear_forward(model, prefix + ".q_proj", queries);
-    q = reshape_num_head(ctx, q, num_heads);
+    q = reshape_num_head(ctx, q, num_heads);  // (H, S, H_dim)
     ggml_tensor* k = Linear_forward(model, prefix + ".k_proj", keys);
-    k = reshape_num_head(ctx, k, num_heads);
-    ggml_tensor* v = Linear_forward(model, prefix + ".q_proj", queries);
-    v = reshape_num_head(ctx, v, num_heads);
+    k = reshape_num_head(ctx, k, num_heads);  // (H, S, H_dim)
+    ggml_tensor* v = Linear_forward(model, prefix + ".v_proj", values);
+    v = ggml_reshape_3d(ctx, v, head_dim, num_heads, slen); // (S, H, H_dim)
+    // v = ggml_permute(ctx, v, 1, 2, 0, 3);  // (H, H_dim, S)
+    v = ggml_permute(ctx, v, 1, 0, 2, 3);  // (S, H_dim, H)
+    v = ggml_cont(ctx, v);
+
+    ggml_tensor* attn = ggml_flash_attn(ctx, q, k, v, /*masked*/false);  // (H, S, H_dim)
+    attn = ggml_permute(ctx, attn, 0, 2, 1, 3);  // (S, H, H_dim)
+    attn = ggml_cont(ctx, attn);
+    attn = ggml_reshape_2d(ctx, attn, num_heads * head_dim, slen);   // (S, H * V_h)
+    attn = Linear_forward(model, prefix + ".output_proj", attn);              // (S, d_out)
 
-    ggml_tensor* attn = ggml_flash_attn(model.ctx, q, k, v, /*masked*/true);
-    attn = Linear_forward(model, prefix + ".output_proj", attn);
     return attn;
-    // ggml_tensor* attn = SDPA_forward(q, k, v, nullptr);
-    // // (H, S, V_h) -> (S, H, V_h)
-    // attn = ggml_transpose(ctx, attn);
-    // // (S, H, V_h) -> (S, V_proj)
-    // attn = ggml_reshape_3d()
 }
 
+// ggml_tensor* attn_weights = ggml_mul_mat(ctx, q, k);  // (H, S, S)
+//     attn_weights = attn_weights * (q.size(-1) ** -0.5)
+
+//     if mask is not None:
+//         attn_weights = attn_weights + mask
+
+//     # For numerical stability run in single precision.
+//     attn_weights = softmax(attn_weights, dim=-1, dtype=torch.float32)
+
+//     attn_weights = attn_weights.type_as(q)
+
+//     if training and dropout_p > 0.0:
+//         attn_weights = dropout(attn_weights, dropout_p)
+
+//     # (*, S, S_kv) @ (*, S_kv, V) = (*, S, V)
+//     attn = torch.matmul(attn_weights, values)
+
+//     return attn, attn_weights if needs_weights else None
+
 // extern "C" ggml_tensor* // (d_out, seq_len)
 // SDPA_forward(
 //     fairseq2_model& model,

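The commented-out block above mirrors the fairseq2 reference implementation of scaled dot-product attention that SDPA_forward is meant to reproduce. As a debugging aid, here is a minimal NumPy restatement of the same steps (scores, scale by head_dim ** -0.5, softmax, weighted sum); the function name and the (H, S, H_dim) layout are assumptions for illustration only, not part of the ggml or fairseq2 API.

import numpy as np

def sdpa_reference(q, k, v, mask=None):
    # q, k, v: (H, S, H_dim), the layout produced by reshape_num_head
    attn_weights = q @ k.transpose(0, 2, 1) * (q.shape[-1] ** -0.5)  # (H, S, S)
    if mask is not None:
        attn_weights = attn_weights + mask  # an (S, S) mask broadcasts over heads
    # numerically stable softmax over the last axis
    attn_weights = attn_weights - attn_weights.max(axis=-1, keepdims=True)
    attn_weights = np.exp(attn_weights)
    attn_weights /= attn_weights.sum(axis=-1, keepdims=True)
    return attn_weights @ v  # (H, S, H_dim)
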
+ 8 - 1
ggml/examples/unity/fairseq2.h

@@ -7,15 +7,22 @@
 
 
 struct fairseq2_model {
-    ggml_context* ctx;
+    // Context that owns the memory of all model tensors
+    ggml_context* tensors_ctx;
+    // Named tensors; each one should belong to tensors_ctx
     std::map<std::string, struct ggml_tensor *> tensors;
     void* arch;
     void* hparams;
+    // An inference context, not owned by this object.
+    // TODO: is this the best place to store it, or should it be passed to all forward methods?
+    ggml_context* ctx;
 };
 
 /// allocate the fairseq2 model and hyperparameters
 extern "C" fairseq2_model* fairseq2_model_alloc();
+// free the model and all its owned tensors
 extern "C" void fairseq2_model_free(fairseq2_model* model);
+extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx);
 
 extern "C" std::string* std_string_alloc(char* c_str);
 extern "C" void std_string_free(std::string* str);

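The split between tensors_ctx (weights owned by the model) and ctx (a caller-provided scratch context for inference graphs) is what the new g_model fixture in test_unity_cpp.py relies on. A sketch of the intended lifecycle from Python, assuming ggml.py exposes ggml_init, ggml_init_params and ggml_free like the other raw bindings used in the tests; the buffer size is arbitrary.

import ggml

# Weights are loaded once into the model's own tensors_ctx and freed with the model.
with ggml.load_unity_ggml_file("seamlessM4T_medium.ggml") as model:
    # A separate, caller-owned context for building inference graphs.
    params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None, no_alloc=False)
    ctx = ggml.ggml_init(params)
    ggml.lib.fairseq2_model_set_inference_ctx(model, ctx)

    # ... build graphs with ggml.forward(...), compute, read outputs ...

    ggml.ggml_free(ctx)  # frees only the scratch graph memory, not the model weights
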
+ 1 - 1
ggml/examples/unity/model_loader.cpp

@@ -29,7 +29,7 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
         std::string name = get_name(fin);
         if (name.length() == 0)
             break;
-        auto tensor = load_tensor_value(fin, model.ctx);
+        auto tensor = load_tensor_value(fin, model.tensors_ctx);
         if (tensor == nullptr) {
             // Abort in case of error, the input stream is corrupted at this point.
             printf("Error while reading tensor %s\n", name.c_str() );

+ 2 - 2
ggml/examples/unity/model_loader.h

@@ -51,8 +51,8 @@ int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ false,
     };
-    model.ctx = ggml_init(params);
+    model.tensors_ctx = ggml_init(params);
 
-    return loader.load_model_weights(model, fin);;
+    return loader.load_model_weights(model, fin);
 }
 

+ 6 - 3
ggml/ggml.py

@@ -99,7 +99,7 @@ def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
     # in GGML ne[0] indicates the contiguous dimension, ie the last one in numpy and torch
     ne = shape[::-1]
     if len(ne) >= GGML_MAX_DIMS:
-        return   # type: ignore
+        return  # type: ignore
 
     # ne is always of the same length
     padding = (1,) * (GGML_MAX_DIMS - len(ne))
@@ -218,6 +218,9 @@ def GptVocab() -> NativeObj:
     return NativeObj("gpt_vocab")
 
 
+lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p]
+
+
 def Fairseq2Model() -> NativeObj:
     return NativeObj("fairseq2_model")
 
@@ -290,7 +293,7 @@ _FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}
 
 
 def forward(
-    layer_name: str, model: NativeObj, prefix: str, *inputs: ggml_tensor_p
+    layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p
 ) -> ggml_tensor_p:
     fwd: Any = _FORWARD_CACHE.get(layer_name)
     if fwd is None:
@@ -303,4 +306,4 @@ def forward(
         _FORWARD_CACHE[layer_name] = fwd
 
     with CppStr(prefix) as std_prefix:
-        return fwd(model.ptr, std_prefix, *inputs)  # ignore: type[no-any-return]
+        return fwd(model, std_prefix, *inputs)  # ignore: type[no-any-return]

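With this change forward() takes the raw model pointer (what the g_model fixture yields) instead of a NativeObj, and presumably resolves the native <layer_name>_forward symbol through _FORWARD_CACHE. A minimal calling sketch; g_model, ctx and gx are assumed to exist as in the tests, the weight prefix is purely hypothetical, and the unused mask slot is filled with the queries tensor to match the four-tensor C++ signature.

import ctypes
import ggml

# g_model: ctypes.c_void_p from load_unity_ggml_file, ctx: its inference context,
# gx: a (seq_len, model_dim) tensor created in ctx via ggml.from_numpy.
gy = ggml.forward(
    "MultiheadAttention",      # dispatches to MultiheadAttention_forward
    g_model,
    "some.layer.self_attn",    # hypothetical prefix; weights are fetched as "<prefix>.q_proj", etc.
    gx, gx, gx, gx,            # queries, keys, values, and the (currently ignored) mask slot
)
gf = ggml.ggml_build_forward(gy)
ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
y = ggml.to_numpy(gy)
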
+ 4 - 5
ggml/include/ggml/ggml.h

@@ -347,7 +347,7 @@ extern "C" {
         GGML_OP_NONE = 0,
 
         GGML_OP_DUP,
-        GGML_OP_ADD,
+        GGML_OP_ADD, //2
         GGML_OP_ADD1,
         GGML_OP_ACC,
         GGML_OP_SUB,
@@ -369,16 +369,15 @@ extern "C" {
         GGML_OP_RMS_NORM_BACK,
         GGML_OP_GROUP_NORM,
 
-        GGML_OP_MUL_MAT,
+        GGML_OP_MUL_MAT, //23
         GGML_OP_OUT_PROD,
-
         GGML_OP_SCALE,
         GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
-        GGML_OP_RESHAPE,
+        GGML_OP_RESHAPE, //29
         GGML_OP_VIEW,
-        GGML_OP_PERMUTE,
+        GGML_OP_PERMUTE, //32
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
         GGML_OP_GET_ROWS_BACK,

+ 72 - 26
ggml/test_unity_cpp.py

@@ -52,6 +52,7 @@ def test_ggml_bindings_work(ctx: Ctx) -> None:
     output = ggml.ggml_get_f32_1d(f, 0)
     assert output == 16.0
 
+
 def test_ggml_matmul(ctx: Ctx) -> None:
     # Instantiate tensors
     a = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 4, 2)
@@ -67,7 +68,6 @@ def test_ggml_matmul(ctx: Ctx) -> None:
     for i in range(4 * 3):
         ggml.ggml_set_f32_1d(x, i, i)
 
-
     ggml.ggml_set_f32(a, 0.0)
     ggml.ggml_set_f32_1d(a, 1, 1.0)
     ggml.ggml_set_f32_1d(a, 7, 1.0)
@@ -129,11 +129,13 @@ def test_to_numpy_works_with_f32(ctx: Ctx) -> None:
     # assert nb.shape == (21, 11)
     assert nb[0, 5] == 5
     assert nb[3, 5] == 11 * 3 + 5
-    assert np.allclose(nb, np.array(range(11 * 21), dtype=np.float32).reshape(ggml.shape(b)))
+    assert np.allclose(
+        nb, np.array(range(11 * 21), dtype=np.float32).reshape(ggml.shape(b))
+    )
     ggml.ggml_set_f32_1d(b, 11 * 3 + 5, -1.5)
     assert nb[3, 5] == -1.5
 
-    sum_rows = ggml.ggml_sum_rows(ctx, b);
+    sum_rows = ggml.ggml_sum_rows(ctx, b)
     gf = ggml.ggml_build_forward(sum_rows)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
     np_sum_rows = np.sum(nb, axis=-1, keepdims=True)
@@ -147,7 +149,9 @@ def test_to_numpy_works_with_f32(ctx: Ctx) -> None:
     nc = ggml.to_numpy(c)
     assert ggml.shape(c) == (32, 22, 12)
     assert nc[3, 5, 11] == 22 * 12 * 3 + 12 * 5 + 11
-    assert np.allclose(nc, np.array(range(12 * 22 * 32), dtype=np.float32).reshape(ggml.shape(c)))
+    assert np.allclose(
+        nc, np.array(range(12 * 22 * 32), dtype=np.float32).reshape(ggml.shape(c))
+    )
     ggml.ggml_set_f32_1d(c, 22 * 12 * 3 + 12 * 5 + 11, -1.5)
     assert nc[3, 5, 11] == -1.5
 
@@ -240,11 +244,18 @@ def test_ning_model_load(ctx: Ctx) -> None:
 
 
 @pytest.fixture(scope="module")
-def g_model() -> NativeObj:
+def g_model_once() -> Iterator[ctypes.c_void_p]:
     model_file = Path(__file__).parent / "seamlessM4T_medium.ggml"
     if not model_file.exists():
         convert_model("seamlessM4T_medium", model_file)
-    return ggml.load_unity_ggml_file(model_file)
+    with ggml.load_unity_ggml_file(model_file) as model:
+        yield model
+
+
+@pytest.fixture()
+def g_model(ctx: Ctx, g_model_once: ctypes.c_void_p) -> ctypes.c_void_p:
+    ggml.lib.fairseq2_model_set_inference_ctx(g_model_once, ctx)
+    return g_model_once
 
 
 @pytest.fixture(scope="module")
@@ -266,18 +277,16 @@ def test_hparams_code_is_up_to_date() -> None:
     assert hparams_struct in actual_code
 
 
-def test_forward_linear(ctx: Ctx) -> None:
+def test_numpy_mul_mat(ctx: Ctx) -> None:
     slen, d_in, d_out = (5, 4, 2)
     # torch.nn and fairseq2.nn assume (seq_len, dim) inputs.
     x = np.zeros((slen, d_in), dtype=np.float32)  # (seq_len, dim_in)
-    # torch.nn.init.uniform_(x, -1, 1)
-    x[0, :] = [1, 1/3, 0, 0]
+    x[0, :] = [1, 1 / 3, 0, 0]
 
-    # linear = fairseq2.nn.Linear(d_in, d_out, bias=False)
     weight = np.eye(d_out, d_in, dtype=np.float32)
     weight[1, 1] = 1
     # assert weight.shape == (d_out, d_in) # (dim_out, dim_in)
-    y_exp = (x @ weight.T)  # (seq_len, dim_out)
+    y_exp = x @ weight.T  # (seq_len, dim_out)
 
     gx = ggml.from_numpy(ctx, x)  # (dim_in, seq_len)
     gw = ggml.from_numpy(ctx, weight)  # (dim_in, dim_out)
@@ -294,9 +303,37 @@ def test_forward_linear(ctx: Ctx) -> None:
     assert np.allclose(y_exp, y)
 
 
+@torch.no_grad()
+def test_torch_spda_vs_ggml_flash_attn(ctx: Ctx) -> None:
+    slen, d_in, num_heads = (5, 4, 2)
+    torch.random.manual_seed(0)
+    q = torch.zeros((num_heads, slen, d_in))
+    torch.nn.init.uniform_(q, -1, 1)
+    k = torch.zeros((num_heads, slen, d_in))
+    torch.nn.init.uniform_(k, -1, 1)
+    v = torch.zeros((num_heads, slen, d_in))
+    torch.nn.init.uniform_(v, -1, 1)
+
+    # Note: q, k and v are independent random tensors here, so this only checks
+    # that torch's SDPA and ggml_flash_attn agree numerically.
+    y_exp = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
+    y_exp = y_exp.numpy()
+    gq = ggml.from_numpy(ctx, q.numpy())
+    gk = ggml.from_numpy(ctx, k.numpy())
+    # ggml flash attention expects a different axis order for v:
+    gv = ggml.from_numpy(ctx, v.transpose(1, 2).contiguous().numpy())
+    assert ggml.shape(gv) == (num_heads, d_in, slen)
+    gy = ggml.ggml_flash_attn(ctx, gq, gk, gv, True)
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+
+    y = ggml.to_numpy(gy)
+    assert np.allclose(y_exp, y)
+
+
 def test_forward_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
-    x = torch.empty((1024))
-    torch.nn.init.uniform_(x, -1, 1)
+    x = torch.empty((21, 1024))  # (seq_len, model_dim)
+    torch.nn.init.uniform_(x, -1 / 32, 1 / 32)
 
     # Test FFN without LayerNorm
     y_exp = pt_model.text_encoder.layers[0].ffn(x).numpy()
@@ -307,14 +344,12 @@ def test_forward_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
 
-    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1]).reshape(-1)
-    abs_diff = np.max(np.abs(y - y_exp))
-    assert abs_diff < 1e-2
-    assert np.allclose(y_exp, y, rtol=1e-3)
+    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1])
+    assert np.allclose(y_exp, y, rtol=2e-2, atol=1e-4)
 
 
 def test_forward_layer_norm(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
-    x = torch.empty((1024,))
+    x = torch.empty((21, 1024))
     torch.nn.init.uniform_(x, -1, 1)
 
     y_exp = pt_model.text_encoder.layers[0].ffn_layer_norm(x).numpy()
@@ -323,22 +358,21 @@ def test_forward_layer_norm(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
 
-    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1]).reshape(-1)
-    abs_diff = np.max(np.abs(y - y_exp))
-    assert np.allclose(y_exp, y)
+    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1])
+    assert np.allclose(y_exp, y, rtol=1e-3, atol=1e-4)
 
 
 def test_forward_self_attn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
-    x = torch.empty((1, 25, 1024))
-
+    x = torch.empty((1, 21, 1024))
+    torch.random.manual_seed(0)
     torch.nn.init.uniform_(x, -1, 1)
 
     self_attn = pt_model.text_encoder.layers[0].self_attn
     # Replace spda by just returning queries
     # TODO: implement spda
-    self_attn.spda = lambda *qkv, **kwargs: qkv[0]
+    # self_attn.spda = lambda *qkv, **kwargs: qkv[0]
+
 
-    y_exp = self_attn(x, None, x, x).numpy()
     gx = ggml.from_numpy(ctx, x)
     gy = ggml.forward(
         "MultiheadAttention",
@@ -351,7 +385,19 @@ def test_forward_self_attn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
     )
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+    y = ggml.to_numpy(gy)
+    names = "ql,q,qt,qp,kl,k,kt,kp,vl,v,vt,vp,v_cont,attn,attn_p,attn_cont,attn_reshape,outl,out"
+    assert gf.n_nodes == len(names.split(","))
+    gf_nodes = {}
+    for i, name in enumerate(names.split(",")):
+        mid = ggml.to_numpy(gf.nodes[i])
+        # print(name, mid.shape, mid)
+        gf_nodes[name] = mid
+
+    y_exp = self_attn(x, None, x, x).numpy()
+    y_exp = y_exp.squeeze(0)  # remove batch dimension
 
-    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1]).reshape(-1)
+    assert y.shape == y_exp.shape
     abs_diff = np.max(np.abs(y - y_exp))
     assert np.allclose(y_exp, y)
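
For reference while debugging the permutes in MultiheadAttention_forward, a NumPy restatement of the layouts involved: reshape_num_head turns a (S, dim) input into (H, S, H_dim), and the flash-attention test above shows that ggml_flash_attn expects v transposed to (H, H_dim, S). The sizes are illustrative; keep in mind that ggml's ne[] lists axes in the reverse of NumPy order.

import numpy as np

slen, model_dim, num_heads = 21, 1024, 16
head_dim = model_dim // num_heads

x = np.random.rand(slen, model_dim).astype(np.float32)  # (S, dim), as in fairseq2

# reshape_num_head: (S, dim) -> (S, H, H_dim) -> (H, S, H_dim)
q = x.reshape(slen, num_heads, head_dim).transpose(1, 0, 2)
assert q.shape == (num_heads, slen, head_dim)

# ggml_flash_attn wants values laid out as (H, H_dim, S),
# cf. the ggml.shape(gv) assertion in test_torch_spda_vs_ggml_flash_attn
v = x.reshape(slen, num_heads, head_dim).transpose(1, 2, 0)
assert v.shape == (num_heads, head_dim, slen)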