github
/
seamless_communication
mirror of https://github.com/facebookresearch/seamless_communication.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
							import ggml
import ctypes
import torch
import pytest
import numpy as np
import torch
import fairseq2.nn
import fairseq2.nn.transformer
import logging
import sys
from pathlib import Path
from ctypes_utils import Ptr
from ctypes import c_void_p
from typing import Any
from pathlib import Path
from typing import Iterator
from ggml import NativeObj
from ggml_convert import convert_model
from seamless_communication.models.inference.translator import Translator, Modality

Ctx = ggml.ggml_context_p

UNITY_MODELS = Path(__file__).parent / "examples/unity/models"
CTX_PARAMS = ggml.ggml_init_params(mem_size=1024 * 1024 * 1024, mem_buffer=None)


@pytest.fixture(name="ctx")
def _ctx() -> Iterator[Ctx]:
    """Allocate a new context with 1024 MB of memory"""
    try:
        ctx = ggml.ggml_init(params=CTX_PARAMS)
        yield ctx
    finally:
        ggml.ggml_free(ctx)


def test_ggml_bindings_work(ctx: Ctx) -> None:
    # Instantiate tensors
    x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1)
    a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1)
    b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1)

    # Use ggml operations to build a computational graph
    x2 = ggml.ggml_mul(ctx, x, x)
    f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b)

    gf = ggml.ggml_build_forward(f)

    # Set the input values
    ggml.ggml_set_f32(x, 2.0)
    ggml.ggml_set_f32(a, 3.0)
    ggml.ggml_set_f32(b, 4.0)

    # Compute the graph
    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)

    # Get the output value
    output = ggml.ggml_get_f32_1d(f, 0)
    assert output == 16.0


def test_ggml_matmul(ctx: Ctx) -> None:
    # Instantiate tensors
    a = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 4, 2)
    x = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 4, 3)

    # Use ggml operations to build a computational graph
    y = ggml.ggml_mul_mat(ctx, a, x)
    assert ggml.shape(y) == (3, 2)
    gf = ggml.ggml_build_forward(y)

    # Set the input values
    ggml.ggml_set_f32(x, 0.0)
    for i in range(4 * 3):
        ggml.ggml_set_f32_1d(x, i, i)

    ggml.ggml_set_f32(a, 0.0)
    ggml.ggml_set_f32_1d(a, 1, 1.0)
    ggml.ggml_set_f32_1d(a, 7, 1.0)
    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
    output = [[ggml.ggml_get_f32_1d(y, j * 2 + i) for j in range(3)] for i in range(2)]
    assert output == [[1, 5, 9], [3, 7, 11]]


def test_shape_works(ctx: Ctx) -> None:
    """GGML shape order convention is the reverse from numpy"""
    a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
    assert ggml.shape(a) == (10,)

    b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21)
    assert ggml.shape(b) == (21, 11)

    c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
    assert ggml.shape(c) == (32, 22, 12)


def test_nb_works(ctx: Ctx) -> None:
    a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
    assert ggml.nb(a) == (4, 40, 40, 40)

    b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F16, 11, 21)
    assert ggml.nb(b) == (2, 22, 462, 462)

    c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
    assert ggml.nb(c) == (4, 48, 1056, 33792)


def test_strides_works(ctx: Ctx) -> None:
    a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
    assert ggml.strides(a) == np.ones((10,), dtype=np.float32).strides

    b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21)
    assert ggml.strides(b) == np.ones((21, 11), dtype=np.float32).strides

    c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
    assert ggml.strides(c) == np.ones((32, 22, 12), dtype=np.float32).strides


def test_to_numpy_works_with_f32(ctx: Ctx) -> None:
    a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
    na = ggml.to_numpy(a)
    for i in range(10):
        ggml.ggml_set_f32_1d(a, i, i)
    assert na[5] == 5
    assert np.allclose(na, np.array(range(10), dtype=np.float32))
    ggml.ggml_set_f32_1d(a, 5, -1.5)
    assert na[5] == -1.5

    # Note: GGML order of dims is reversed wrt numpy shapes
    b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21)
    for i in range(11 * 21):
        ggml.ggml_set_f32_1d(b, i, i)
    nb = ggml.to_numpy(b)
    # assert nb.shape == (21, 11)
    assert nb[0, 5] == 5
    assert nb[3, 5] == 11 * 3 + 5
    assert np.allclose(
        nb, np.array(range(11 * 21), dtype=np.float32).reshape(ggml.shape(b))
    )
    ggml.ggml_set_f32_1d(b, 11 * 3 + 5, -1.5)
    assert nb[3, 5] == -1.5

    sum_rows = ggml.ggml_sum_rows(ctx, b)
    gf = ggml.ggml_build_forward(sum_rows)
    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
    np_sum_rows = np.sum(nb, axis=-1, keepdims=True)
    assert np_sum_rows.shape == ggml.shape(sum_rows)
    for i in range(11):
        assert np_sum_rows[i] == ggml.ggml_get_f32_1d(sum_rows, i)

    c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
    for i in range(12 * 22 * 32):
        ggml.ggml_set_f32_1d(c, i, i)
    nc = ggml.to_numpy(c)
    assert ggml.shape(c) == (32, 22, 12)
    assert nc[3, 5, 11] == 22 * 12 * 3 + 12 * 5 + 11
    assert np.allclose(
        nc, np.array(range(12 * 22 * 32), dtype=np.float32).reshape(ggml.shape(c))
    )
    ggml.ggml_set_f32_1d(c, 22 * 12 * 3 + 12 * 5 + 11, -1.5)
    assert nc[3, 5, 11] == -1.5


def test_from_numpy_works_with_f32(ctx: Ctx) -> None:
    a = np.random.normal(size=(10,)).astype(dtype=np.float32)
    ga = ggml.from_numpy(ctx, a)
    assert ggml.shape(ga) == (10,)
    assert ggml.nb(ga) == ggml.nb(ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10))
    assert np.allclose(a, ggml.to_numpy(ga))

    a = np.random.normal(size=(11, 21)).astype(dtype=np.float32)
    ga = ggml.from_numpy(ctx, a)
    assert ggml.shape(ga) == (11, 21)
    assert ggml.nb(ga) == ggml.nb(
        ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, *a.shape[::-1])
    )
    assert np.allclose(a, ggml.to_numpy(ga))

    a = np.random.normal(size=(12, 22, 32)).astype(dtype=np.float32)
    ga = ggml.from_numpy(ctx, a)
    assert ggml.shape(ga) == (12, 22, 32)
    assert ggml.nb(ga) == ggml.nb(
        ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, *a.shape[::-1])
    )
    assert np.allclose(a, ggml.to_numpy(ga))


def test_to_numpy_works_with_f16(ctx: Ctx) -> None:
    # We explicitly fill the tensor otherwise they might have non-zero values in them.
    a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F16, 10)
    na = ggml.to_numpy(a)
    ggml.ggml_set_f32(a, 2.14)
    assert np.allclose(na, np.ones((10,), dtype=np.float16) * 2.14)
    ggml.ggml_set_f32(a, 4.28)
    assert np.allclose(na, np.ones((10,), dtype=np.float16) * 4.28)

    b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F16, 11, 21)
    nb = ggml.to_numpy(b)
    ggml.ggml_set_f32(b, 4.18)
    assert np.allclose(nb, np.ones((21, 11), dtype=np.float16) * 4.18)
    ggml.ggml_set_f32(b, 5.12)
    assert np.allclose(nb, np.ones((21, 11), dtype=np.float16) * 5.12)

    c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F16, 12, 22, 32)
    nc = ggml.to_numpy(c)
    ggml.ggml_set_f32(c, 3.16)
    assert np.allclose(nc, np.ones((32, 22, 12), dtype=np.float16) * 3.16)
    ggml.ggml_set_f32(c, 5.08)
    assert np.allclose(nc, np.ones((32, 22, 12), dtype=np.float16) * 5.08)


def test_from_numpy_works_with_f16(ctx: Ctx) -> None:
    a = np.random.normal(size=(10,)).astype(dtype=np.float16)
    ga = ggml.from_numpy(ctx, a)
    assert np.allclose(a, ggml.to_numpy(ga))
    a = np.random.normal(size=(11, 21)).astype(dtype=np.float16)
    ga = ggml.from_numpy(ctx, a)
    assert np.allclose(a, ggml.to_numpy(ga))
    a = np.random.normal(size=(12, 22, 32)).astype(dtype=np.float16)
    ga = ggml.from_numpy(ctx, a)
    assert np.allclose(a, ggml.to_numpy(ga))


def test_to_numpy_works_with_transposed(ctx: Ctx) -> None:
    ga = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 10, 5)
    a = ggml.to_numpy(ga)
    a[...] = np.arange(50).reshape(5, 10).astype(dtype=np.float32)

    gat = ggml.ggml_transpose(ctx, ga)

    gf = ggml.ggml_build_forward(ga)
    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)

    at = ggml.to_numpy(gat)

    assert np.allclose(a.T, at)


def test_ggml_slice(ctx: Ctx) -> None:
    ga = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 10, 5)
    a = ggml.to_numpy(ga)
    a[...] = np.arange(50).reshape(5, 10).astype(dtype=np.float32)

    gs0 = ggml.ggml_slice(ctx, ga, 0, 3, 7)
    gf = ggml.ggml_build_forward(ga)
    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
    s0 = ggml.to_numpy(gs0)

    assert np.allclose(a[:, 3:7], s0)

    gs1 = ggml.ggml_slice(ctx, ga, 1, 2, 5)
    gf = ggml.ggml_build_forward(ga)
    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
    s1 = ggml.to_numpy(gs1)
    assert np.allclose(a[2:5, :], s1)


def test_numpy_mul_mat(ctx: Ctx) -> None:
    slen, d_in, d_out = (5, 4, 2)
    # torch.nn and fairseq2.nn assumes (seq_len, dim) to represent inputs,
    x = np.zeros((slen, d_in), dtype=np.float32)  # (seq_len, dim_in)
    x[0, :] = [1, 1 / 3, 0, 0]

    weight = np.eye(d_out, d_in, dtype=np.float32)
    weight[1, 1] = 1
    # assert weight.shape == (d_out, d_in) # (dim_out, dim_in)
    y_exp = x @ weight.T  # (seq_len, dim_out)

    gx = ggml.from_numpy(ctx, x)  # (dim_in, seq_len)
    gw = ggml.from_numpy(ctx, weight)  # (dim_in, dim_out)
    # gb = ggml.from_numpy(ctx, linear.bias.numpy())  # (dim_out)
    # GGML linear impl
    assert ggml.ggml_can_mul_mat(gw, gx)
    # gy = ggml.ggml_add(ctx, ggml.ggml_mul_mat(ctx, gw, gx), gb)  # (dim_out, seq_len)
    gy = ggml.ggml_mul_mat(ctx, gw, gx)  # (dim_out, seq_len)

    gf = ggml.ggml_build_forward(gy)
    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)

    y = ggml.to_numpy(gf.nodes[gf.n_nodes - 1])
    assert np.allclose(y_exp, y)


@torch.no_grad()
def test_torch_spda_vs_ggml_flash_attn(ctx: Ctx) -> None:
    slen, d_in, num_heads = (5, 4, 2)
    torch.random.manual_seed(0)
    q = torch.zeros((num_heads, slen, d_in))
    torch.nn.init.uniform_(q, -1, 1)
    k = torch.zeros((num_heads, slen, d_in))
    torch.nn.init.uniform_(k, -1, 1)
    v = torch.zeros((num_heads, slen, d_in))
    torch.nn.init.uniform_(v, -1, 1)

    # Note: we are using x for both keys and queries, so every position
    # attends mostly to itself, hence y_exp looks a bit like arange(slen)
    y_exp = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
    y_exp = y_exp.numpy()
    gq = ggml.from_numpy(ctx, q.numpy())
    gk = ggml.from_numpy(ctx, k.numpy())
    # ggml flash attention expect a different order of axis for v:
    # (H, slen, H_dim) -> (H, H_dim, slen)
    gv = ggml.from_numpy(ctx, v.transpose(1, 2).contiguous().numpy())
    assert ggml.shape(gv) == (num_heads, d_in, slen)
    gy = ggml.ggml_flash_attn(ctx, gq, gk, gv, True)
    gf = ggml.ggml_build_forward(gy)
    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)

    y = ggml.to_numpy(gy)
    assert np.allclose(y_exp, y)


def test_ggml_softmax_vs_torch(ctx: Ctx) -> None:
    x = torch.empty((5, 8, 4))
    torch.nn.init.uniform_(x, -1, 1)
    y_exp = torch.softmax(x, dim=-1).numpy()

    gx = ggml.from_numpy(ctx, x.numpy())
    gy = ggml.ggml_soft_max(ctx, gx)
    y = ggml.to_numpy(gy)

    gf = ggml.ggml_build_forward(gy)
    ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)

    assert np.allclose(y_exp, y, rtol=1e-3)