Guillaume Wenzek 1 year ago
parent
commit
fa85f05545

+ 52 - 21
ggml/examples/unity/fairseq2.cpp

@@ -7,15 +7,21 @@ extern "C" fairseq2_model* fairseq2_model_alloc() {
     auto* model = new fairseq2_model;
     model->hparams = new std::uint8_t[8 * 1024];
     model->arch = new std::uint64_t[16 * 1024];  // max tensors allowed
+    model->tensors_ctx = nullptr;
     return model;
 };
 
 extern "C" void fairseq2_model_free(fairseq2_model* model) {
+    if (model->tensors_ctx) ggml_free(model->tensors_ctx);
     delete (std::uint64_t*)(model->arch);
     delete (std::uint8_t*)model->hparams;
     delete model;
 };
 
+extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx) {
+    model->ctx = ctx;
+}
+
 extern "C" std::string* std_string_alloc(char* c_str) {
     return new std::string(c_str);
 }
@@ -163,43 +169,68 @@ void MultiheadAttention_init(
 }
 
 ggml_tensor* reshape_num_head(ggml_context* ctx, ggml_tensor* x, int num_heads) {
-    int slen = x->ne[0];
-    // (S, M) -> (S, K_proj)
-    x = ggml_reshape_3d(ctx, x, slen, num_heads, x->ne[1] / num_heads);
-    // (S, K_proj) -> (H, S, K_h)
-    return ggml_transpose(ctx, x);
+    int slen = x->ne[1];
+    int model_dim = x->ne[0];
+    // (S, dim) -> (S, H, H_dim)
+    x = ggml_reshape_3d(ctx, x, model_dim / num_heads, num_heads, slen);
+    // (S, H, H_dim) -> (H, S, H_dim)
+    x = ggml_permute(ctx, x, 0, 2, 1, 3);
+    return x;
 }
 
 
 
-extern "C" ggml_tensor* // (d_in, seq_len)
+extern "C" ggml_tensor* // (slen, d_in)
 MultiheadAttention_forward(
     fairseq2_model& model,
     const std::string &prefix,
-    ggml_tensor* queries,  // (d_in, len_q)
-    ggml_tensor* keys,  // (d_in, len_k)
-    ggml_tensor* values,  // (d_out, len_k)
-    ggml_tensor* mask // (seq_len, len_q)
+    ggml_tensor* queries,  // (slen, d_in)
+    ggml_tensor* keys,  // (klen, d_in)
+    ggml_tensor* values,  // (klen, d_out)
+    ggml_tensor* _ // (klen, slen)  TODO: do we need to pass the mask here?
 ) {
+    int slen = queries->ne[1];
     int num_heads = 16;
+    int head_dim = queries->ne[0] / num_heads;
     ggml_context* ctx = model.ctx;
     ggml_tensor* q = Linear_forward(model, prefix + ".q_proj", queries);
-    q = reshape_num_head(ctx, q, num_heads);
+    q = reshape_num_head(ctx, q, num_heads);  // (H, S, H_dim)
     ggml_tensor* k = Linear_forward(model, prefix + ".k_proj", keys);
-    k = reshape_num_head(ctx, k, num_heads);
-    ggml_tensor* v = Linear_forward(model, prefix + ".q_proj", queries);
-    v = reshape_num_head(ctx, v, num_heads);
+    k = reshape_num_head(ctx, k, num_heads);  // (H, S, H_dim)
+    ggml_tensor* v = Linear_forward(model, prefix + ".v_proj", values);
+    v = ggml_reshape_3d(ctx, v, head_dim, num_heads, slen); // (S, H, H_dim)
+    // v = ggml_permute(ctx, v, 1, 2, 0, 3);  // (H, H_dim, S)
+    v = ggml_permute(ctx, v, 1, 0, 2, 3);  // (S, H_dim, H)
+    v = ggml_cont(ctx, v);
+
+    ggml_tensor* attn = ggml_flash_attn(ctx, q, k, v, /*masked*/false);  // (H, S, H_dim)
+    attn = ggml_permute(ctx, attn, 0, 2, 1, 3);  // (S, H, H_dim)
+    attn = ggml_cont(ctx, attn);
+    attn = ggml_reshape_2d(ctx, attn, num_heads * head_dim, slen);   // (S, H * V_h)
+    attn = Linear_forward(model, prefix + ".output_proj", attn);              // (S, d_out)
 
-    ggml_tensor* attn = ggml_flash_attn(model.ctx, q, k, v, /*masked*/true);
-    attn = Linear_forward(model, prefix + ".output_proj", attn);
     return attn;
-    // ggml_tensor* attn = SDPA_forward(q, k, v, nullptr);
-    // // (H, S, V_h) -> (S, H, V_h)
-    // attn = ggml_transpose(ctx, attn);
-    // // (S, H, V_h) -> (S, V_proj)
-    // attn = ggml_reshape_3d()
 }
 
+// ggml_tensor* attn_weights = ggml_mul_mat(ctx, q, k);  // (H, S, S)
+//     attn_weights = attn_weights * (q.size(-1) ** -0.5)
+
+//     if mask is not None:
+//         attn_weights = attn_weights + mask
+
+//     # For numerical stability run in single precision.
+//     attn_weights = softmax(attn_weights, dim=-1, dtype=torch.float32)
+
+//     attn_weights = attn_weights.type_as(q)
+
+//     if training and dropout_p > 0.0:
+//         attn_weights = dropout(attn_weights, dropout_p)
+
+//     # (*, S, S_kv) @ (*, S_kv, V) = (*, S, V)
+//     attn = torch.matmul(attn_weights, values)
+
+//     return attn, attn_weights if needs_weights else None
+
 // extern "C" ggml_tensor* // (d_out, seq_len)
 // SDPA_forward(
 //     fairseq2_model& model,

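The commented-out block above mirrors the fairseq2 reference implementation of scaled dot-product attention that SDPA_forward is meant to reproduce. As a debugging aid, here is a minimal NumPy restatement of the same steps (scores, scale by head_dim ** -0.5, softmax, weighted sum); the function name and the (H, S, H_dim) layout are assumptions for illustration only, not part of the ggml or fairseq2 API.

import numpy as np

def sdpa_reference(q, k, v, mask=None):
    # q, k, v: (H, S, H_dim), the layout produced by reshape_num_head
    attn_weights = q @ k.transpose(0, 2, 1) * (q.shape[-1] ** -0.5)  # (H, S, S)
    if mask is not None:
        attn_weights = attn_weights + mask  # an (S, S) mask broadcasts over heads
    # numerically stable softmax over the last axis
    attn_weights = attn_weights - attn_weights.max(axis=-1, keepdims=True)
    attn_weights = np.exp(attn_weights)
    attn_weights /= attn_weights.sum(axis=-1, keepdims=True)
    return attn_weights @ v  # (H, S, H_dim)
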
+ 8 - 1
ggml/examples/unity/fairseq2.h

@@ -7,15 +7,22 @@
 
 
 struct fairseq2_model {
-    ggml_context* ctx;
+    // Context that owns the memory of all model tensors
+    ggml_context* tensors_ctx;
+    // Named tensors; each one should belong to tensors_ctx
     std::map<std::string, struct ggml_tensor *> tensors;
     void* arch;
     void* hparams;
+    // An inference context, not owned by this object.
+    // TODO: is this the best place to store it, or should it be passed to all forward methods?
+    ggml_context* ctx;
 };
 
 /// allocate the fairseq2 model and hyperparameters
 extern "C" fairseq2_model* fairseq2_model_alloc();
+// free the model and all its owned tensors
 extern "C" void fairseq2_model_free(fairseq2_model* model);
+extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx);
 
 extern "C" std::string* std_string_alloc(char* c_str);
 extern "C" void std_string_free(std::string* str);

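The split between tensors_ctx (weights owned by the model) and ctx (a caller-provided scratch context for inference graphs) is what the new g_model fixture in test_unity_cpp.py relies on. A sketch of the intended lifecycle from Python, assuming ggml.py exposes ggml_init, ggml_init_params and ggml_free like the other raw bindings used in the tests; the buffer size is arbitrary.

import ggml

# Weights are loaded once into the model's own tensors_ctx and freed with the model.
with ggml.load_unity_ggml_file("seamlessM4T_medium.ggml") as model:
    # A separate, caller-owned context for building inference graphs.
    params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None, no_alloc=False)
    ctx = ggml.ggml_init(params)
    ggml.lib.fairseq2_model_set_inference_ctx(model, ctx)

    # ... build graphs with ggml.forward(...), compute, read outputs ...

    ggml.ggml_free(ctx)  # frees only the scratch graph memory, not the model weights
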
+ 1 - 1
ggml/examples/unity/model_loader.cpp

@@ -29,7 +29,7 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
         std::string name = get_name(fin);
         if (name.length() == 0)
             break;
-        auto tensor = load_tensor_value(fin, model.ctx);
+        auto tensor = load_tensor_value(fin, model.tensors_ctx);
         if (tensor == nullptr) {
             // Abort in case of error, the input stream is corrupted at this point.
             printf("Error while reading tensor %s\n", name.c_str() );

+ 2 - 2
ggml/examples/unity/model_loader.h

@@ -51,8 +51,8 @@ int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ false,
     };
-    model.ctx = ggml_init(params);
+    model.tensors_ctx = ggml_init(params);
 
-    return loader.load_model_weights(model, fin);;
+    return loader.load_model_weights(model, fin);
 }
 

+ 6 - 3
ggml/ggml.py

@@ -99,7 +99,7 @@ def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
     # in GGML ne[0] indicates the contiguous dimension, ie the last one in numpy and torch
     ne = shape[::-1]
     if len(ne) >= GGML_MAX_DIMS:
-        return   # type: ignore
+        return  # type: ignore
 
     # ne is always of the same length
     padding = (1,) * (GGML_MAX_DIMS - len(ne))
@@ -218,6 +218,9 @@ def GptVocab() -> NativeObj:
     return NativeObj("gpt_vocab")
 
 
+lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p]
+
+
 def Fairseq2Model() -> NativeObj:
     return NativeObj("fairseq2_model")
 
@@ -290,7 +293,7 @@ _FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}
 
 
 def forward(
-    layer_name: str, model: NativeObj, prefix: str, *inputs: ggml_tensor_p
+    layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p
 ) -> ggml_tensor_p:
     fwd: Any = _FORWARD_CACHE.get(layer_name)
     if fwd is None:
@@ -303,4 +306,4 @@ def forward(
         _FORWARD_CACHE[layer_name] = fwd
 
     with CppStr(prefix) as std_prefix:
-        return fwd(model.ptr, std_prefix, *inputs)  # ignore: type[no-any-return]
+        return fwd(model, std_prefix, *inputs)  # ignore: type[no-any-return]

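With this change forward() takes the raw model pointer (what the g_model fixture yields) instead of a NativeObj, and presumably resolves the native <layer_name>_forward symbol through _FORWARD_CACHE. A minimal calling sketch; g_model, ctx and gx are assumed to exist as in the tests, the weight prefix is purely hypothetical, and the unused mask slot is filled with the queries tensor to match the four-tensor C++ signature.

import ctypes
import ggml

# g_model: ctypes.c_void_p from load_unity_ggml_file, ctx: its inference context,
# gx: a (seq_len, model_dim) tensor created in ctx via ggml.from_numpy.
gy = ggml.forward(
    "MultiheadAttention",      # dispatches to MultiheadAttention_forward
    g_model,
    "some.layer.self_attn",    # hypothetical prefix; weights are fetched as "<prefix>.q_proj", etc.
    gx, gx, gx, gx,            # queries, keys, values, and the (currently ignored) mask slot
)
gf = ggml.ggml_build_forward(gy)
ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
y = ggml.to_numpy(gy)
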
+ 4 - 5
ggml/include/ggml/ggml.h

@@ -347,7 +347,7 @@ extern "C" {
         GGML_OP_NONE = 0,
 
         GGML_OP_DUP,
-        GGML_OP_ADD,
+        GGML_OP_ADD, //2
         GGML_OP_ADD1,
         GGML_OP_ACC,
         GGML_OP_SUB,
@@ -369,16 +369,15 @@ extern "C" {
         GGML_OP_RMS_NORM_BACK,
         GGML_OP_GROUP_NORM,
 
-        GGML_OP_MUL_MAT,
+        GGML_OP_MUL_MAT, //23
         GGML_OP_OUT_PROD,
-
         GGML_OP_SCALE,
         GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
-        GGML_OP_RESHAPE,
+        GGML_OP_RESHAPE, //29
         GGML_OP_VIEW,
-        GGML_OP_PERMUTE,
+        GGML_OP_PERMUTE, //32
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
         GGML_OP_GET_ROWS_BACK,

+ 72 - 26
ggml/test_unity_cpp.py

@@ -52,6 +52,7 @@ def test_ggml_bindings_work(ctx: Ctx) -> None:
     output = ggml.ggml_get_f32_1d(f, 0)
     assert output == 16.0
 
+
 def test_ggml_matmul(ctx: Ctx) -> None:
     # Instantiate tensors
     a = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 4, 2)
@@ -67,7 +68,6 @@ def test_ggml_matmul(ctx: Ctx) -> None:
     for i in range(4 * 3):
         ggml.ggml_set_f32_1d(x, i, i)
 
-
     ggml.ggml_set_f32(a, 0.0)
     ggml.ggml_set_f32_1d(a, 1, 1.0)
     ggml.ggml_set_f32_1d(a, 7, 1.0)
@@ -129,11 +129,13 @@ def test_to_numpy_works_with_f32(ctx: Ctx) -> None:
     # assert nb.shape == (21, 11)
     assert nb[0, 5] == 5
     assert nb[3, 5] == 11 * 3 + 5
-    assert np.allclose(nb, np.array(range(11 * 21), dtype=np.float32).reshape(ggml.shape(b)))
+    assert np.allclose(
+        nb, np.array(range(11 * 21), dtype=np.float32).reshape(ggml.shape(b))
+    )
     ggml.ggml_set_f32_1d(b, 11 * 3 + 5, -1.5)
     assert nb[3, 5] == -1.5
 
-    sum_rows = ggml.ggml_sum_rows(ctx, b);
+    sum_rows = ggml.ggml_sum_rows(ctx, b)
     gf = ggml.ggml_build_forward(sum_rows)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
     np_sum_rows = np.sum(nb, axis=-1, keepdims=True)
@@ -147,7 +149,9 @@ def test_to_numpy_works_with_f32(ctx: Ctx) -> None:
     nc = ggml.to_numpy(c)
     assert ggml.shape(c) == (32, 22, 12)
     assert nc[3, 5, 11] == 22 * 12 * 3 + 12 * 5 + 11
-    assert np.allclose(nc, np.array(range(12 * 22 * 32), dtype=np.float32).reshape(ggml.shape(c)))
+    assert np.allclose(
+        nc, np.array(range(12 * 22 * 32), dtype=np.float32).reshape(ggml.shape(c))
+    )
     ggml.ggml_set_f32_1d(c, 22 * 12 * 3 + 12 * 5 + 11, -1.5)
     assert nc[3, 5, 11] == -1.5
 
@@ -240,11 +244,18 @@ def test_ning_model_load(ctx: Ctx) -> None:
 
 
 @pytest.fixture(scope="module")
-def g_model() -> NativeObj:
+def g_model_once() -> Iterator[ctypes.c_void_p]:
     model_file = Path(__file__).parent / "seamlessM4T_medium.ggml"
     if not model_file.exists():
         convert_model("seamlessM4T_medium", model_file)
-    return ggml.load_unity_ggml_file(model_file)
+    with ggml.load_unity_ggml_file(model_file) as model:
+        yield model
+
+
+@pytest.fixture()
+def g_model(ctx: Ctx, g_model_once: ctypes.c_void_p) -> ctypes.c_void_p:
+    ggml.lib.fairseq2_model_set_inference_ctx(g_model_once, ctx)
+    return g_model_once
 
 
 @pytest.fixture(scope="module")
@@ -266,18 +277,16 @@ def test_hparams_code_is_up_to_date() -> None:
     assert hparams_struct in actual_code
 
 
-def test_forward_linear(ctx: Ctx) -> None:
+def test_numpy_mul_mat(ctx: Ctx) -> None:
     slen, d_in, d_out = (5, 4, 2)
     # torch.nn and fairseq2.nn assume (seq_len, dim) inputs.
     x = np.zeros((slen, d_in), dtype=np.float32)  # (seq_len, dim_in)
-    # torch.nn.init.uniform_(x, -1, 1)
-    x[0, :] = [1, 1/3, 0, 0]
+    x[0, :] = [1, 1 / 3, 0, 0]
 
-    # linear = fairseq2.nn.Linear(d_in, d_out, bias=False)
     weight = np.eye(d_out, d_in, dtype=np.float32)
     weight[1, 1] = 1
     # assert weight.shape == (d_out, d_in) # (dim_out, dim_in)
-    y_exp = (x @ weight.T)  # (seq_len, dim_out)
+    y_exp = x @ weight.T  # (seq_len, dim_out)
 
     gx = ggml.from_numpy(ctx, x)  # (dim_in, seq_len)
     gw = ggml.from_numpy(ctx, weight)  # (dim_in, dim_out)
@@ -294,9 +303,37 @@ def test_forward_linear(ctx: Ctx) -> None:
     assert np.allclose(y_exp, y)
 
 
+@torch.no_grad()
+def test_torch_spda_vs_ggml_flash_attn(ctx: Ctx) -> None:
+    slen, d_in, num_heads = (5, 4, 2)
+    torch.random.manual_seed(0)
+    q = torch.zeros((num_heads, slen, d_in))
+    torch.nn.init.uniform_(q, -1, 1)
+    k = torch.zeros((num_heads, slen, d_in))
+    torch.nn.init.uniform_(k, -1, 1)
+    v = torch.zeros((num_heads, slen, d_in))
+    torch.nn.init.uniform_(v, -1, 1)
+
+    # Note: q, k and v are independent random tensors here, so this only checks
+    # that torch's SDPA and ggml_flash_attn agree numerically.
+    y_exp = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
+    y_exp = y_exp.numpy()
+    gq = ggml.from_numpy(ctx, q.numpy())
+    gk = ggml.from_numpy(ctx, k.numpy())
+    # ggml flash attention expects a different axis order for v:
+    gv = ggml.from_numpy(ctx, v.transpose(1, 2).contiguous().numpy())
+    assert ggml.shape(gv) == (num_heads, d_in, slen)
+    gy = ggml.ggml_flash_attn(ctx, gq, gk, gv, True)
+    gf = ggml.ggml_build_forward(gy)
+    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+
+    y = ggml.to_numpy(gy)
+    assert np.allclose(y_exp, y)
+
+
 def test_forward_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
-    x = torch.empty((1024))
-    torch.nn.init.uniform_(x, -1, 1)
+    x = torch.empty((21, 1024))  # (seq_len, model_dim)
+    torch.nn.init.uniform_(x, -1 / 32, 1 / 32)
 
     # Test FFN without LayerNorm
     y_exp = pt_model.text_encoder.layers[0].ffn(x).numpy()
@@ -307,14 +344,12 @@ def test_forward_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
 
-    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1]).reshape(-1)
-    abs_diff = np.max(np.abs(y - y_exp))
-    assert abs_diff < 1e-2
-    assert np.allclose(y_exp, y, rtol=1e-3)
+    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1])
+    assert np.allclose(y_exp, y, rtol=2e-2, atol=1e-4)
 
 
 def test_forward_layer_norm(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
-    x = torch.empty((1024,))
+    x = torch.empty((21, 1024))
     torch.nn.init.uniform_(x, -1, 1)
 
     y_exp = pt_model.text_encoder.layers[0].ffn_layer_norm(x).numpy()
@@ -323,22 +358,21 @@ def test_forward_layer_norm(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
 
-    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1]).reshape(-1)
-    abs_diff = np.max(np.abs(y - y_exp))
-    assert np.allclose(y_exp, y)
+    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1])
+    assert np.allclose(y_exp, y, rtol=1e-3, atol=1e-4)
 
 
 def test_forward_self_attn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
-    x = torch.empty((1, 25, 1024))
-
+    x = torch.empty((1, 21, 1024))
+    torch.random.manual_seed(0)
     torch.nn.init.uniform_(x, -1, 1)
 
     self_attn = pt_model.text_encoder.layers[0].self_attn
     # Replace spda by just returning queries
     # TODO: implement spda
-    self_attn.spda = lambda *qkv, **kwargs: qkv[0]
+    # self_attn.spda = lambda *qkv, **kwargs: qkv[0]
+
 
-    y_exp = self_attn(x, None, x, x).numpy()
     gx = ggml.from_numpy(ctx, x)
     gy = ggml.forward(
         "MultiheadAttention",
@@ -351,7 +385,19 @@ def test_forward_self_attn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
     )
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+    y = ggml.to_numpy(gy)
+    names = "ql,q,qt,qp,kl,k,kt,kp,vl,v,vt,vp,v_cont,attn,attn_p,attn_cont,attn_reshape,outl,out"
+    assert gf.n_nodes == len(names.split(","))
+    gf_nodes = {}
+    for i, name in enumerate(names.split(",")):
+        mid = ggml.to_numpy(gf.nodes[i])
+        # print(name, mid.shape, mid)
+        gf_nodes[name] = mid
+
+    y_exp = self_attn(x, None, x, x).numpy()
+    y_exp = y_exp.squeeze(0)  # remove batch dimension
 
-    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1]).reshape(-1)
+    assert y.shape == y_exp.shape
     abs_diff = np.max(np.abs(y - y_exp))
     assert np.allclose(y_exp, y)
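
For reference while debugging the permutes in MultiheadAttention_forward, a NumPy restatement of the layouts involved: reshape_num_head turns a (S, dim) input into (H, S, H_dim), and the flash-attention test above shows that ggml_flash_attn expects v transposed to (H, H_dim, S). The sizes are illustrative; keep in mind that ggml's ne[] lists axes in the reverse of NumPy order.

import numpy as np

slen, model_dim, num_heads = 21, 1024, 16
head_dim = model_dim // num_heads

x = np.random.rand(slen, model_dim).astype(np.float32)  # (S, dim), as in fairseq2

# reshape_num_head: (S, dim) -> (S, H, H_dim) -> (H, S, H_dim)
q = x.reshape(slen, num_heads, head_dim).transpose(1, 0, 2)
assert q.shape == (num_heads, slen, head_dim)

# ggml_flash_attn wants values laid out as (H, H_dim, S),
# cf. the ggml.shape(gv) assertion in test_torch_spda_vs_ggml_flash_attn
v = x.reshape(slen, num_heads, head_dim).transpose(1, 2, 0)
assert v.shape == (num_heads, head_dim, slen)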