
fix MultiheadAttention_forward

Guillaume Wenzek 1 year ago
Parent
Commit
28ed039370
4 changed files with 56 additions and 34 deletions
  1. ggml/examples/unity/fairseq2.cpp (+22 -25)
  2. ggml/examples/unity/fairseq2.h (+11 -0)
  3. ggml/ggml.py (+16 -2)
  4. ggml/test_unity_cpp.py (+7 -7)

+ 22 - 25
ggml/examples/unity/fairseq2.cpp

@@ -93,14 +93,11 @@ extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
 }
 
 
-/// Merge the given dimension and the previous one in the tensor.
-/// (..., num_heads, N, ...) -> (..., num_heads * N, ...)
-/// dim is the position of the resulting merged dimension
-/// ggml_flatten_1d(x, d) <==> torch.flatten(x, -1-d-1, -1-d)
 ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim) {
     int n_dims = x->n_dims;
     GGML_ASSERT(dim >= 0);
     GGML_ASSERT(dim < n_dims);
+    GGML_ASSERT(ggml_is_contiguous(x));
     // Nothing to do
     if (dim == n_dims - 1) return x;
 
@@ -123,9 +120,6 @@ ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim) {
     }
 }
 
-/// Split the given dimension.
-/// (..., K * N, ...) -> (..., K, N, ...)
-/// dim is the position of the output dimension with the given number of element (N).
 ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int num_el) {
     int n_dims = x->n_dims;
     GGML_ASSERT(dim >= 0);
@@ -137,7 +131,7 @@ ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int n
         if (dim == 0) {
             return ggml_reshape_3d(ctx, x, num_el, x->ne[0] / num_el, x->ne[1]);
         } else { // dim == 1
-            return ggml_reshape_3d(ctx, x, num_el, x->ne[0] / num_el, x->ne[1]);
+            return ggml_reshape_3d(ctx, x, x->ne[0], num_el, x->ne[1] / num_el);
         }
     } else { // (n_dims == 3)
         if (dim == 0) {
@@ -154,8 +148,9 @@ ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int n
 ggml_tensor* _reshape_num_head(ggml_context* ctx, ggml_tensor* x, int head_dim) {
     // (B, S, dim) -> (B, S, H, H_dim)
     x = ggml_unflatten_1d(ctx, x, 0, head_dim);
-    // (B?, S, H, H_dim) -> (B?, H, S, H_dim)
-    x = ggml_permute(ctx, x, 0, 2, 1, 3);
+    x = ggml_permute(ctx, x, 0, 2, 1, 3); // (B, H, S, H_dim)
+    x = ggml_cont(ctx, x);
+    x = ggml_flatten_1d(ctx, x, 2);  // (B * H, S, H_dim)
     return x;
 }
 
@@ -164,6 +159,8 @@ ggml_tensor* _reshape_num_head_values(ggml_context* ctx, ggml_tensor* v, int hea
     // (B, Sk, dim) -> (B, Sk, H, H_dim)
     v = ggml_unflatten_1d(ctx, v, 0, head_dim);
     v = ggml_permute(ctx, v, 1, 2, 0, 3);  // (B?, H, H_dim, Sk)
+    v = ggml_cont(ctx, v);
+    v = ggml_flatten_1d(ctx, v, 2);  // (B * H, H_dim, Sk)
     return v;
 }
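
The net effect of the two helpers above is that the batch and head dimensions are collapsed into one leading dimension before the matmuls, so ggml_mul_mat runs a batched matrix product per (batch, head) pair. A rough PyTorch sketch of the intended layouts (illustrative only; sizes and names here are assumptions, not repo code):

    import torch

    B, S, H, H_dim = 2, 11, 16, 64          # hypothetical sizes

    # _reshape_num_head: (B, S, H * H_dim) -> (B * H, S, H_dim)
    q = torch.randn(B, S, H * H_dim)
    q = q.unflatten(-1, (H, H_dim))         # (B, S, H, H_dim)
    q = q.permute(0, 2, 1, 3).contiguous()  # (B, H, S, H_dim)
    q = q.flatten(0, 1)                     # (B * H, S, H_dim)

    # _reshape_num_head_values: (B, Sk, H * H_dim) -> (B * H, H_dim, Sk)
    v = torch.randn(B, S, H * H_dim)
    v = v.unflatten(-1, (H, H_dim))         # (B, Sk, H, H_dim)
    v = v.permute(0, 2, 3, 1).contiguous()  # (B, H, H_dim, Sk)
    v = v.flatten(0, 1)                     # (B * H, H_dim, Sk)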
 
@@ -186,27 +183,27 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     GGML_ASSERT(model_dim % num_heads == 0);
 
     ggml_context* ctx = model.ctx;
-    ggml_tensor* q = Linear_forward(model, prefix + ".q_proj", queries);
-    q = _reshape_num_head(ctx, q, head_dim);  // (B, H, S, H_dim)
+    ggml_tensor* q = Linear_forward(model, prefix + ".q_proj", queries); // (B, S, H * H_dim)
     ggml_set_name(q, "q");
+    q = _reshape_num_head(ctx, q, head_dim);  // (B * H, S, H_dim)
     ggml_tensor* k = Linear_forward(model, prefix + ".k_proj", keys);
-    k = _reshape_num_head(ctx, k, head_dim);  // (B, H, Sk, H_dim)
     ggml_set_name(k, "k");
+    k = _reshape_num_head(ctx, k, head_dim);  // (B * H, Sk, H_dim)
 
     ggml_tensor* v = Linear_forward(model, prefix + ".v_proj", values);
-    v = _reshape_num_head_values(ctx, v, head_dim); // (B, H, H_dim, Sk)
-    v = ggml_cont(ctx, v);
     ggml_set_name(v, "v");
+    v = _reshape_num_head_values(ctx, v, head_dim); // (B * H, H_dim, Sk)
+    v = ggml_cont(ctx, v);
 
 #if UNITY_FLASH_ATTN
     // For flash_attn, we assume either no masks, or triangular masks.
-    ggml_tensor* attn = ggml_flash_attn(ctx, q, k, v, /*masked*/mask != nullptr);  // (H, S, H_dim)
+    ggml_tensor* attn = ggml_flash_attn(ctx, q, k, v, /*masked*/mask != nullptr);  // (B * H, S, H_dim)
     ggml_set_name(attn, "attn");
+    // TODO test !
+    attn = ggml_unflatten_1d(ctx, attn, 2, num_heads);  // (B, H, S, H_dim)
     attn = ggml_permute(ctx, attn, 0, 2, 1, 3); // (B, S, H, H_dim)
-    attn = ggml_cont(ctx, attn);
-    attn = ggml_flatten_1d(ctx, attn, 0); // (B, S, H * H_dim)
 #else
-    // (B, H, Sk, H_dim) x (B, H, S, H_dim) -> (B, H, S, Sk)
+    // (B * H, Sk, H_dim) x (B * H, S, H_dim) -> (B * H, S, Sk)
     ggml_tensor* qk = ggml_mul_mat(ctx, k, q);
     ggml_set_name(qk, "qk");
     ggml_tensor* qk_scale = ggml_new_tensor_1d(ctx, qk->type, 1);
@@ -217,17 +214,17 @@ extern "C" ggml_tensor* MultiheadAttention_forward(
     // TODO: Should we replace this by ggml_diag_mask_inf ?
     if (mask) qk = ggml_add(ctx, qk, mask);
     // TODO: upgrade qk to float32 if needed
-    ggml_tensor* attn_weights = ggml_soft_max(ctx, qk);  // (B, H, S, Sk)
+    ggml_tensor* attn_weights = ggml_soft_max(ctx, qk);  // (B * H, S, Sk)
     ggml_set_name(attn_weights, "attn_weights");
 
-    // (B, H, S, Sk) x (B, H, H_dim, Sk) -> (B, H, H_dim, S)
+    // (B * H, S, Sk) x (B * H, H_dim, Sk) -> (B * H, H_dim, S)
     ggml_tensor* attn = ggml_mul_mat(ctx, attn_weights, v);
     ggml_set_name(attn, "attn");
-    attn = ggml_flatten_1d(ctx, attn, 1); // (B, H * H_dim, S)
-    attn = ggml_transpose(ctx, attn); // (B, S, H * H_dim)
-    // // I'm not sure why this one is needed ...
-    attn = ggml_cont(ctx, attn);
+    attn = ggml_unflatten_1d(ctx, attn, 2, num_heads);  // (B, H, H_dim, S)
+    attn = ggml_permute(ctx, attn, 2, 0, 1, 3); // (B, S, H, H_dim)
 #endif  // UNITY_FLASH_ATTN
+    attn = ggml_cont(ctx, attn);
+    attn = ggml_flatten_1d(ctx, attn, 0); // (B, S, H * H_dim)
     // out -> (B, S, d_out)
     ggml_tensor* out = Linear_forward(model, prefix + ".output_proj", attn);
     ggml_set_name(out, "out");
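
For reference, the non-flash branch now follows this shape flow end to end: q/k/v come in with the merged (B * H, ...) layout and the result is reshaped back to (B, S, H * H_dim) for the output projection. A hedged PyTorch sketch of that flow (an illustration of the shapes, not a transcription of the ggml calls):

    import torch

    B, H, S, Sk, H_dim = 2, 16, 11, 21, 64        # hypothetical sizes

    q = torch.randn(B * H, S, H_dim)
    k = torch.randn(B * H, Sk, H_dim)
    v = torch.randn(B * H, H_dim, Sk)

    qk = q @ k.transpose(1, 2) / H_dim ** 0.5     # (B * H, S, Sk)
    attn_weights = qk.softmax(dim=-1)             # (B * H, S, Sk)
    attn = attn_weights @ v.transpose(1, 2)       # (B * H, S, H_dim)

    attn = attn.unflatten(0, (B, H))              # (B, H, S, H_dim)
    attn = attn.permute(0, 2, 1, 3).contiguous()  # (B, S, H, H_dim)
    attn = attn.flatten(-2)                       # (B, S, H * H_dim)
    assert attn.shape == (B, S, H * H_dim)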

+ 11 - 0
ggml/examples/unity/fairseq2.h

@@ -35,6 +35,17 @@ extern "C" ggml_tensor* ggml_slice(
     int64_t end
 );
 
+/// Merge the given dimension and the previous one in the tensor.
+/// (..., num_heads, N, ...) -> (..., num_heads * N, ...)
+/// dim is the position of the resulting merged dimension
+/// ggml_flatten_1d(x, d) <==> torch.flatten(x, -1-d-1, -1-d)
+extern "C" ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim);
+
+/// Split the given dimension.
+/// (..., K * N, ...) -> (..., K, N, ...)
+/// dim is the position of the output dimension with the given number of element (N).
+extern "C" ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int num_el);
+
 extern "C" ggml_tensor* Linear_forward(
     fairseq2_model& model,
     const std::string &prefix,

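Since the doc comments now live in the header, here is a hedged PyTorch restatement of the documented semantics; d counts from ggml's contiguous dimension ne[0], i.e. from the last numpy/torch axis (illustrative only, not repo code):

    import torch

    x = torch.randn(2, 16, 11, 64)            # ggml ne = (64, 11, 16, 2)
    d = 2

    # ggml_flatten_1d(x, d)  <==>  torch.flatten(x, -d - 2, -d - 1)
    flat = torch.flatten(x, -d - 2, -d - 1)   # merges dims -4 and -3 -> (32, 11, 64)

    # ggml_unflatten_1d(x, d, num_el)  <==>  x.unflatten(-1 - d, (-1, num_el))
    unflat = flat.unflatten(-1 - d, (-1, 16)) # splits dim -3 -> (2, 16, 11, 64)

    assert unflat.shape == x.shape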
+ 16 - 2
ggml/ggml.py

@@ -141,7 +141,7 @@ def _strided_to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray:
     res = _void_p_to_np_array(tensor.data, tuple(full_shape), numpy_dtype(tensor.type))
 
     # Extract the correct slice
-    res = res.__getitem__(*[slice(0, n) for n in t_shape])
+    res = res.__getitem__(tuple(slice(0, n) for n in t_shape))
     # TODO: we could handle transposition here
 
     return res
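
This small change matters because numpy's __getitem__ takes a single key: several slices must be wrapped in one tuple, while unpacking them as separate arguments fails for any tensor with more than one dimension. A minimal standalone illustration (not repo code):

    import numpy as np

    res = np.arange(24).reshape(4, 6)
    t_shape = (2, 3)

    # Fixed form: one tuple of slices, same as res[0:2, 0:3]
    ok = res.__getitem__(tuple(slice(0, n) for n in t_shape))
    assert ok.shape == (2, 3)

    # Old form: __getitem__ only accepts one key, so unpacking the list
    # raises TypeError as soon as t_shape has more than one dimension.
    try:
        res.__getitem__(*[slice(0, n) for n in t_shape])
    except TypeError:
        pass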
@@ -175,7 +175,7 @@ def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
     # in GGML ne[0] indicates the contiguous dimension, ie the last one in numpy and torch
     ne = shape[::-1]
     if len(ne) >= GGML_MAX_DIMS:
-        return ne # type: ignore
+        return ne  # type: ignore
 
     # ne is always of the same length
     padding = (1,) * (GGML_MAX_DIMS - len(ne))
@@ -388,6 +388,20 @@ def ggml_slice(
     ...
 
 
+@c_fn(lib)
+def ggml_flatten_1d(
+    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int
+) -> Ptr[ggml_tensor]:
+    return a
+
+
+@c_fn(lib)
+def ggml_unflatten_1d(
+    ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int, num_el: int
+) -> Ptr[ggml_tensor]:
+    return a
+
+
 @c_struct
 class SequenceGeneratorOptions:
     beam_size: int

+ 7 - 7
ggml/test_unity_cpp.py

@@ -167,7 +167,7 @@ def test_MultiheadAttention_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any)
     # Note: we use different lengths for queries and keys,
     # this tests the implementation in decoding context too.
     # Note2: ggml_flash_attn requires that we have more keys than queries
-    gxq = ggml.from_numpy(ctx, x[:, :11, :])
+    gxq = ggml.from_numpy(ctx, x[:, :11, :].contiguous())
     gx = ggml.from_numpy(ctx, x)
     ggml.ggml_set_name(gx, b"x")
     gy = ggml.forward(
@@ -182,7 +182,7 @@ def test_MultiheadAttention_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any)
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
 
-    q_exp = self_attn._project_q(x[:, :11, :], None, None).numpy()
+    q_exp = self_attn.q_proj(x[:, :11, :]).numpy()
 
     y = ggml.to_numpy(gy)
     nodes = {}
@@ -206,13 +206,13 @@ def test_MultiheadAttention_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any)
     if not UNITY_FLASH_ATTN:
         attn_weights = nodes[b"attn_weights"]
         [attn_weights_exp] = attn_weights_hook._storage
-        # Fix the shape of attn_weights_exp
-        attn_weights_exp = attn_weights_exp.unflatten(0, (2, 16)).numpy()
+        attn_weights_exp = attn_weights_exp.numpy()
         assert attn_weights_exp.shape == attn_weights.shape
-        # GGML is very agressively reducing small softmax weights to 0.
-        # assert np.allclose(attn_weights_exp, attn_weights, atol=1e-3)
+        # GGML is very aggressively reducing small softmax weights to 0,
+        # so the error isn't that small
+        assert np.allclose(attn_weights_exp, attn_weights, atol=1e-3)
         # But the sums should be close to 1
-        assert np.allclose(np.sum(attn_weights, axis=-1), np.ones((2, 16, 11)))
+        assert np.allclose(np.sum(attn_weights, axis=-1), np.ones((2 * 16, 11)))
         # And the maximum index should match the original ones.
         assert np.allclose(
             np.argmax(attn_weights_exp, axis=-1), np.argmax(attn_weights, axis=-1)