Guillaume Wenzek 1 year ago
parent
commit
c2e6384e29

+ 11 - 5
ggml/examples/unity/fairseq2.cpp

@@ -50,15 +50,21 @@ void Linear_init(
     }
 }
 
-extern "C" ggml_tensor* Linear_forward(
+extern "C" ggml_tensor*
+Linear_forward(
     fairseq2_model& model,
     const std::string &prefix,
-    ggml_tensor* input
+    ggml_tensor* input  // (d_in)
 ) {
-    ggml_tensor* weight = model.tensors[prefix + ".weight"];
-    ggml_tensor* bias = model.tensors[prefix + ".bias"];
+    // Note: for now we assume un-batched input
+    ggml_tensor* weight = model.tensors[prefix + ".weight"];  // (d_in, d_out)
+    ggml_tensor* bias = model.tensors[prefix + ".bias"];  // (d_out)
 
-    return ggml_add(model.ctx, ggml_mul_mat(model.ctx, weight, input), bias);
+    return ggml_add(
+        model.ctx,
+        ggml_mul_mat(model.ctx, weight, input),  // (d_out)
+        bias
+    );
 }
 
 // LayerNorm

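For reference: ggml_mul_mat(ctx, a, b) contracts over ne[0] of both arguments, so with weight stored as (d_in, d_out) and an un-batched input of shape (d_in), the product comes out as (d_out), which the new shape comments make explicit. A minimal numpy sketch of what Linear_forward computes (names are illustrative, not part of the codebase):

    import numpy as np

    def linear_forward(weight, bias, x):
        # ggml's ne[] lists dimensions fastest-varying first, so a ggml
        # (d_in, d_out) weight is a numpy (d_out, d_in) matrix in row-major.
        return weight @ x + bias  # (d_out,)

    d_in, d_out = 4, 3
    w = np.random.randn(d_out, d_in).astype(np.float32)
    b = np.zeros(d_out, dtype=np.float32)
    x = np.ones(d_in, dtype=np.float32)
    assert linear_forward(w, b, x).shape == (d_out,)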
+ 13 - 3
ggml/examples/unity/model_loader.cpp

@@ -21,7 +21,7 @@ std::ifstream open_ggml_file(const char* fname) {
     return fin;
 }
 
-void
+int
 model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
 {
     size_t total_size = 0;
@@ -30,6 +30,11 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
         if (name.length() == 0)
             break;
         auto tensor = load_tensor_value(fin, model.ctx);
+        if (tensor == nullptr) {
+            // Abort on error: the input stream is corrupted at this point.
+            printf("Error while reading tensor %s\n", name.c_str());
+            return 1;
+        }
         model.tensors[name] = tensor;
         if (DEBUG_MODEL_LOAD) {
             printf("%s [%5ld, %5ld], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), tensor->ne[0], tensor->ne[1], ggml_type_name(tensor->type), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
@@ -38,21 +43,26 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
     }
 
     printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
+    return 0;
 };
 
 ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx)
 {
-    int32_t n_dims;
-    int32_t raw_type;
+    int32_t n_dims = 0;
+    int32_t raw_type = 0;
 
     fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
     fin.read(reinterpret_cast<char *>(&raw_type),  sizeof(raw_type));
     ggml_type type = ggml_type(raw_type);
 
+    if (n_dims <= 0 || n_dims > GGML_MAX_DIMS || raw_type < 0 || raw_type >= GGML_TYPE_COUNT) {
+        return nullptr;
+    }
     int64_t ne[4] = {1, 1, 1, 1};
     for (int i = 0; i < n_dims; ++i) {
         fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
     }
+
     ggml_tensor* tensor = ggml_new_tensor(ctx, type, n_dims, ne);
     fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
     return tensor;

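The nullptr check matters because the stream cannot be resynchronized after a bad header, so the only safe option is to abort. A rough Python sketch of the same validation, assuming the on-disk layout implied by the C++ reads (two little-endian int32s, then one int64 extent per dimension; the GGML_TYPE_COUNT value below is illustrative):

    import struct
    from typing import BinaryIO, Optional, Tuple

    GGML_MAX_DIMS = 4
    GGML_TYPE_COUNT = 19  # take the real value from your ggml build

    def read_tensor_header(fin: BinaryIO) -> Optional[Tuple[Tuple[int, ...], int]]:
        n_dims, raw_type = struct.unpack("<ii", fin.read(8))
        if not (0 < n_dims <= GGML_MAX_DIMS) or not (0 <= raw_type < GGML_TYPE_COUNT):
            return None  # corrupted stream: caller aborts, like load_model_weights
        ne = struct.unpack(f"<{n_dims}q", fin.read(8 * n_dims))
        return ne, raw_type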
+ 3 - 3
ggml/examples/unity/model_loader.h

@@ -27,7 +27,7 @@ public:
 
     virtual void tensors_alloc(fairseq2_model& model) = 0;
 
-    void load_model_weights(fairseq2_model &model, std::ifstream &fin);
+    int load_model_weights(fairseq2_model &model, std::ifstream &fin);
 
 private:
     ggml_tensor * next_tensor(std::ifstream &fin, fairseq2_model &model);
@@ -40,7 +40,7 @@ ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx);
 std::ifstream open_ggml_file(const char* fname);
 
 template<typename T>
-void load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
+int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
     T loader;
     auto fin = open_ggml_file(fname);
     loader.load_hparams(model, fin);
@@ -53,6 +53,6 @@ void load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
     };
     model.ctx = ggml_init(params);
 
-    loader.load_model_weights(model, fin);;
+    return loader.load_model_weights(model, fin);
 }
 

+ 1 - 1
ggml/examples/unity/unity.cpp

@@ -415,7 +415,7 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
             std::string name(length, 0);
             fin.read(&name[0], length);
 
-            std::cout << "loading " << name << " " << n_dims << std::endl;
+            // std::cout << "loading " << name << " " << n_dims << std::endl;
 
             if (model.tensors.find(name) == model.tensors.end()) {
                 fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());

+ 1 - 1
ggml/examples/unity/unity_model_loader.cpp

@@ -68,7 +68,7 @@ void unity_model_loader::tensors_alloc(fairseq2_model &model)
     // tensor_map["text_decoder.layer_norm.bias"] = arch.layer_norm_b;
 };
 
-extern "C" void load_unity_ggml_file(fairseq2_model& model, const char* fname) {
+extern "C" int load_unity_ggml_file(fairseq2_model& model, const char* fname) {
     return load_fairseq2_ggml_file<unity_model_loader>(model, fname);
 }
 

+ 40 - 10
ggml/ggml.py

@@ -8,7 +8,6 @@ import ctypes
 import torch
 import functools
 from pathlib import Path
-from typing import Self
 from typing import Dict
 from typing import Callable
 from typing import Any
@@ -48,11 +47,20 @@ def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
     return tuple([tensor.ne[i] for i in range(ndims)])
 
 
+def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
+    if isinstance(tensor, ctypes._Pointer):
+        tensor = tensor.contents
+    return tuple([tensor.nb[i] for i in range(4)])
+
+
 def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
+    raise NotImplementedError()
     if isinstance(tensor, ctypes._Pointer):
         tensor = tensor.contents
     ndims = tensor.n_dims
-    return tuple([tensor.nb[i] for i in range(ndims)])
+    num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
+    # TODO: convert to numpy strides
+    return num_bytes
 
 
 def to_numpy(tensor: Union[ggml_tensor, ggml_tensor_p]) -> np.ndarray:
@@ -77,6 +85,7 @@ def to_numpy(tensor: Union[ggml_tensor, ggml_tensor_p]) -> np.ndarray:
 
 
 GgmlShape = ctypes.c_int64 * GGML_MAX_DIMS
+GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS
 
 
 def from_file(
@@ -88,19 +97,37 @@ def from_file(
 
 def _pad_shape(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
     if len(shape) >= 4:
-        return shape
+        return shape  # type: ignore
 
     padding = (1,) * (4 - len(shape))
     return shape + padding  # type: ignore
 
 
-def from_numpy(ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"]) -> ggml_tensor_p:
+def _compute_nbytes(
+    ne: Tuple[int, int, int, int], type: ctypes.c_int
+) -> Tuple[int, int, int, int]:
+    nb0 = ggml_type_size(type)
+    nb1 = nb0 * (ne[0] // ggml_blck_size(type))
+    nb2 = nb1 * ne[1]
+    nb3 = nb2 * ne[2]
+    return (nb0, nb1, nb2, nb3)
+
+
+def from_numpy(
+    ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"]
+) -> ggml_tensor_p:
     if type(array).__name__ == "Tensor":
         array = array.numpy()
-    tensor_p = ggml_new_tensor(ctx, from_numpy_dtype(array.dtype), 1, GgmlShape())
+    # Create an empty tensor so we don't allocate memory for the data pointer
+    gtype = from_numpy_dtype(array.dtype)
+    tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
+    # Fill out the correct dimensions and shape.
     tensor_p.contents.n_dims = array.ndim
+    shape = _pad_shape(array.shape)
+    tensor_p.contents.ne = GgmlShape(*shape)
+    tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(shape, gtype))
+    # point the tensor data to the content of the numpy array.
     tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
-    tensor_p.contents.ne = GgmlShape(*_pad_shape(array.shape))
     # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
     # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")
 
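_compute_nbytes fills in the per-dimension byte strides the way ggml itself does: nb[0] is the (block) element size and each further nb[i] multiplies in the previous extent. A pure-Python check of the f32 case against the values asserted in test_nb_works further down:

    def compute_nbytes_f32(shape):
        # f32 has block size 1 and 4 bytes per element; pad to 4 dims
        # the way _pad_shape does.
        ne = tuple(shape) + (1,) * (4 - len(shape))
        nb0 = 4
        nb1 = nb0 * ne[0]
        nb2 = nb1 * ne[1]
        nb3 = nb2 * ne[2]
        return (nb0, nb1, nb2, nb3)

    assert compute_nbytes_f32((10,)) == (4, 40, 40, 40)
    assert compute_nbytes_f32((12, 22, 32)) == (4, 48, 1056, 33792)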
@@ -181,12 +208,14 @@ def GptVocab() -> NativeObj:
 def Fairseq2Model() -> NativeObj:
     return NativeObj("fairseq2_model")
 
+
 lib.std_string_alloc.argtypes = [ctypes.c_char_p]
 lib.std_string_alloc.restype = ctypes.c_void_p
 lib.std_string_free.argtypes = [ctypes.c_void_p]
 lib.std_string_free.restype = None
 NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)
 
+
 @functools.lru_cache(1024)
 def CppStr(content: str) -> NativeObj:
     c_str = ctypes.create_string_buffer(content.encode("utf-8"))
@@ -196,7 +225,6 @@ def CppStr(content: str) -> NativeObj:
 
 lib.unity_model_load.argtypes = [ctypes.c_char_p, ctypes.c_void_p, ctypes.c_void_p]
 
-
 def unity_model_load(model_file: Path) -> Tuple[NativeObj, NativeObj]:
     model = UnityModel()
     vocab = GptVocab()
@@ -209,13 +237,15 @@ def unity_model_load(model_file: Path) -> Tuple[NativeObj, NativeObj]:
 
 
 lib.load_unity_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-lib.load_unity_ggml_file.restype = None
+lib.load_unity_ggml_file.restype = ctypes.c_int
 
 
 def load_unity_ggml_file(model_file: Path) -> NativeObj:
     model = Fairseq2Model()
     bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
-    lib.load_unity_ggml_file(model.ptr, bytes_file)
+    err = lib.load_unity_ggml_file(model.ptr, bytes_file)
+    if err:
+        raise Exception("Failed to load model")
     return model
 
 
@@ -242,7 +272,7 @@ def unity_eval(
     return lib.unity_eval(allocr, model.ptr, tensor, n_threads)
 
 
-_FORWARD_CACHE: Dict[str, Callable[[...], ggml_tensor_p]] = {}
+_FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}
 
 
 def forward(

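With restype set to c_int, a C++-side failure now surfaces as a Python exception instead of a silently half-initialized model. A hypothetical call site (the file name is the one used by the tests below):

    from pathlib import Path
    import ggml

    # Raises Exception("Failed to load model") if the C++ loader returns non-zero.
    model = ggml.load_unity_ggml_file(Path("seamlessM4T_medium.ggml"))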
+ 4 - 1
ggml/ggml_convert.py

@@ -94,7 +94,10 @@ def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -
     """
     for key, value in state_dict.items():
         write_string(out, key)
-        write_tensor(out, value)
+        if key.endswith(".bias") and value.ndim == 1:
+            # GGML broadcasting isn't as flexible as numpy's
+            value = value.reshape(1, -1)
+        write_tensor(out, value.contiguous())
 
 
 def write_string(out: BufferedWriter, value: str) -> None:

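The (1, -1) reshape sidesteps ggml's weaker broadcasting: numpy adds a (d,) bias to an (n, d) activation implicitly, while ggml_add wants the bias written out with an explicit leading dimension. A quick numpy illustration (shapes only; the bytes are unchanged):

    import numpy as np

    bias = np.arange(4, dtype=np.float32)  # (4,) as exported from PyTorch
    bias2d = bias.reshape(1, -1)           # (1, 4): same bytes, still contiguous
    assert np.shares_memory(bias, bias2d)
    assert (np.zeros((3, 4), dtype=np.float32) + bias2d).shape == (3, 4)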
+ 36 - 11
ggml/test_unity_cpp.py

@@ -63,9 +63,18 @@ def test_shape_works(ctx: Ctx) -> None:
     assert ggml.shape(c) == (12, 22, 32)
 
 
-@pytest.mark.xfail(
-    reason="TODO: understand diff between ggml strides and numpy strides"
-)
+def test_nb_works(ctx: Ctx) -> None:
+    a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
+    assert ggml.nb(a) == (4, 40, 40, 40)
+
+    b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F16, 11, 21)
+    assert ggml.nb(b) == (2, 22, 462, 462)
+
+    c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
+    assert ggml.nb(c) == (4, 48, 1056, 33792)
+
+
+@pytest.mark.xfail(reason="TODO: fix strides")
 def test_strides_works(ctx: Ctx) -> None:
     a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
     assert ggml.strides(a) == np.ones((10,), dtype=np.float32).strides
@@ -81,21 +90,37 @@ def test_to_numpy_works_with_f32(ctx: Ctx) -> None:
     a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
     a = ggml.ggml_set_f32(a, 2.14)
     assert np.allclose(ggml.to_numpy(a), np.ones((10,)) * 2.14)
+
     b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21)
-    assert np.allclose(ggml.to_numpy(b), np.zeros((11, 21)))
+    b = ggml.ggml_set_f32(b, 2.14)
+    assert np.allclose(ggml.to_numpy(b), np.ones((11, 21)) * 2.14)
+
     c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
-    assert np.allclose(ggml.to_numpy(c), np.zeros((12, 22, 32)))
+    c = ggml.ggml_set_f32(c, 2.14)
+    assert np.allclose(ggml.to_numpy(c), np.ones((12, 22, 32)) * 2.14)
 
 
 def test_from_numpy_works_with_f32(ctx: Ctx) -> None:
     a = np.random.normal(size=(10,)).astype(dtype=np.float32)
     ga = ggml.from_numpy(ctx, a)
+    assert ggml.shape(ga) == (10,)
+    assert ggml.nb(ga) == ggml.nb(ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10))
     assert np.allclose(a, ggml.to_numpy(ga))
+
     a = np.random.normal(size=(11, 21)).astype(dtype=np.float32)
     ga = ggml.from_numpy(ctx, a)
+    assert ggml.shape(ga) == (11, 21)
+    assert ggml.nb(ga) == ggml.nb(
+        ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21)
+    )
     assert np.allclose(a, ggml.to_numpy(ga))
+
     a = np.random.normal(size=(12, 22, 32)).astype(dtype=np.float32)
     ga = ggml.from_numpy(ctx, a)
+    assert ggml.shape(ga) == (12, 22, 32)
+    assert ggml.nb(ga) == ggml.nb(
+        ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
+    )
     assert np.allclose(a, ggml.to_numpy(ga))
 
 
@@ -163,10 +188,12 @@ def g_model() -> NativeObj:
 @pytest.fixture(scope="module")
 def pt_model() -> Iterator[Any]:
     model = load_unity_model("seamlessM4T_medium")
+    print(model)
     model.eval()
     with torch.inference_mode():
         yield model
 
+
 @pytest.mark.xfail(reason="TODO")
 def test_hparams_code_is_up_to_date() -> None:
     model_file = Path(__file__).parent / "seamlessM4T_medium.ggml"
@@ -177,8 +204,8 @@ def test_hparams_code_is_up_to_date() -> None:
     assert hparams_struct in actual_code
 
 
-def test_unity_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
-    x = torch.empty((1024,))
+def test_forward_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
+    x = torch.empty((1024,))
     torch.nn.init.uniform_(x, -1, 1)
 
     # Test FFN without LayerNorm
@@ -196,15 +223,13 @@ def test_unity_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
     assert np.allclose(y_exp, y, rtol=1e-3)
 
 
-def test_unity_layer_norm(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
+def test_forward_layer_norm(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
     x = torch.empty((1024,))
     torch.nn.init.uniform_(x, -1, 1)
 
     y_exp = pt_model.text_encoder.layers[0].ffn_layer_norm(x).numpy()
     gx = ggml.from_numpy(ctx, x)
-    gy = ggml.forward(
-        "LayerNorm", g_model, "text_encoder.layers.0.ffn_layer_norm", gx
-    )
+    gy = ggml.forward("LayerNorm", g_model, "text_encoder.layers.0.ffn_layer_norm", gx)
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)