Guillaume Wenzek 1 year ago
parent
commit
c2e6384e29

+ 11 - 5
ggml/examples/unity/fairseq2.cpp

@@ -50,15 +50,21 @@ void Linear_init(
     }
 }
 
-extern "C" ggml_tensor* Linear_forward(
+extern "C" ggml_tensor*
+Linear_forward(
     fairseq2_model& model,
     const std::string &prefix,
-    ggml_tensor* input
+    ggml_tensor* input  // (d_in)
 ) {
-    ggml_tensor* weight = model.tensors[prefix + ".weight"];
-    ggml_tensor* bias = model.tensors[prefix + ".bias"];
+    // Note: for now we assume un-batched input
+    ggml_tensor* weight = model.tensors[prefix + ".weight"];  // (d_in, d_out)
+    ggml_tensor* bias = model.tensors[prefix + ".bias"];  // (d_out)
 
-    return ggml_add(model.ctx, ggml_mul_mat(model.ctx, weight, input), bias);
+    return ggml_add(
+        model.ctx,
+        ggml_mul_mat(model.ctx, weight, input),  // (d_out)
+        bias
+    );
 }
 
 // LayerNorm

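For reference: ggml_mul_mat(ctx, a, b) contracts over ne[0] of both arguments, so with weight stored as (d_in, d_out) and an un-batched input of shape (d_in), the product comes out as (d_out), which the new shape comments make explicit. A minimal numpy sketch of what Linear_forward computes (names are illustrative, not part of the codebase):

    import numpy as np

    def linear_forward(weight, bias, x):
        # ggml's ne[] lists dimensions fastest-varying first, so a ggml
        # (d_in, d_out) weight is a numpy (d_out, d_in) matrix in row-major.
        return weight @ x + bias  # (d_out,)

    d_in, d_out = 4, 3
    w = np.random.randn(d_out, d_in).astype(np.float32)
    b = np.zeros(d_out, dtype=np.float32)
    x = np.ones(d_in, dtype=np.float32)
    assert linear_forward(w, b, x).shape == (d_out,)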
+ 13 - 3
ggml/examples/unity/model_loader.cpp

@@ -21,7 +21,7 @@ std::ifstream open_ggml_file(const char* fname) {
     return fin;
 }
 
-void
+int
 model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
 {
     size_t total_size = 0;
@@ -30,6 +30,11 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
         if (name.length() == 0)
             break;
         auto tensor = load_tensor_value(fin, model.ctx);
+        if (tensor == nullptr) {
+            // Abort on error: the input stream is corrupted at this point.
+            printf("Error while reading tensor %s\n", name.c_str());
+            return 1;
+        }
         model.tensors[name] = tensor;
         if (DEBUG_MODEL_LOAD) {
             printf("%s [%5ld, %5ld], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), tensor->ne[0], tensor->ne[1], ggml_type_name(tensor->type), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
@@ -38,21 +43,26 @@ model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
     }
 
     printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
+    return 0;
 };
 
 ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx)
 {
-    int32_t n_dims;
-    int32_t raw_type;
+    int32_t n_dims = 0;
+    int32_t raw_type = 0;
 
     fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
     fin.read(reinterpret_cast<char *>(&raw_type),  sizeof(raw_type));
     ggml_type type = ggml_type(raw_type);
 
+    if (n_dims <= 0 || n_dims > GGML_MAX_DIMS || raw_type < 0 || raw_type >= GGML_TYPE_COUNT) {
+        return nullptr;
+    }
     int64_t ne[4] = {1, 1, 1, 1};
     for (int i = 0; i < n_dims; ++i) {
         fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
     }
+
     ggml_tensor* tensor = ggml_new_tensor(ctx, type, n_dims, ne);
     fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
     return tensor;

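The nullptr check matters because the stream cannot be resynchronized after a bad header, so the only safe option is to abort. A rough Python sketch of the same validation, assuming the on-disk layout implied by the C++ reads (two little-endian int32s, then one int64 extent per dimension; the GGML_TYPE_COUNT value below is illustrative):

    import struct
    from typing import BinaryIO, Optional, Tuple

    GGML_MAX_DIMS = 4
    GGML_TYPE_COUNT = 19  # take the real value from your ggml build

    def read_tensor_header(fin: BinaryIO) -> Optional[Tuple[Tuple[int, ...], int]]:
        n_dims, raw_type = struct.unpack("<ii", fin.read(8))
        if not (0 < n_dims <= GGML_MAX_DIMS) or not (0 <= raw_type < GGML_TYPE_COUNT):
            return None  # corrupted stream: caller aborts, like load_model_weights
        ne = struct.unpack(f"<{n_dims}q", fin.read(8 * n_dims))
        return ne, raw_type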
+ 3 - 3
ggml/examples/unity/model_loader.h

@@ -27,7 +27,7 @@ public:
 
     virtual void tensors_alloc(fairseq2_model& model) = 0;
 
-    void load_model_weights(fairseq2_model &model, std::ifstream &fin);
+    int load_model_weights(fairseq2_model &model, std::ifstream &fin);
 
 private:
     ggml_tensor * next_tensor(std::ifstream &fin, fairseq2_model &model);
@@ -40,7 +40,7 @@ ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx);
 std::ifstream open_ggml_file(const char* fname);
 
 template<typename T>
-void load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
+int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
     T loader;
     auto fin = open_ggml_file(fname);
     loader.load_hparams(model, fin);
@@ -53,6 +53,6 @@ void load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
     };
     model.ctx = ggml_init(params);
 
-    loader.load_model_weights(model, fin);;
+    return loader.load_model_weights(model, fin);
 }
 

+ 1 - 1
ggml/examples/unity/unity.cpp

@@ -415,7 +415,7 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
             std::string name(length, 0);
             fin.read(&name[0], length);
 
-            std::cout << "loading " << name << " " << n_dims << std::endl;
+            // std::cout << "loading " << name << " " << n_dims << std::endl;
 
             if (model.tensors.find(name) == model.tensors.end()) {
                 fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());

+ 1 - 1
ggml/examples/unity/unity_model_loader.cpp

@@ -68,7 +68,7 @@ void unity_model_loader::tensors_alloc(fairseq2_model &model)
     // tensor_map["text_decoder.layer_norm.bias"] = arch.layer_norm_b;
 };
 
-extern "C" void load_unity_ggml_file(fairseq2_model& model, const char* fname) {
+extern "C" int load_unity_ggml_file(fairseq2_model& model, const char* fname) {
     return load_fairseq2_ggml_file<unity_model_loader>(model, fname);
 }
 

+ 40 - 10
ggml/ggml.py

@@ -8,7 +8,6 @@ import ctypes
 import torch
 import functools
 from pathlib import Path
-from typing import Self
 from typing import Dict
 from typing import Callable
 from typing import Any
@@ -48,11 +47,20 @@ def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
     return tuple([tensor.ne[i] for i in range(ndims)])
 
 
+def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
+    if isinstance(tensor, ctypes._Pointer):
+        tensor = tensor.contents
+    return tuple([tensor.nb[i] for i in range(4)])
+
+
 def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]:
+    raise NotImplementedError()
     if isinstance(tensor, ctypes._Pointer):
         tensor = tensor.contents
     ndims = tensor.n_dims
-    return tuple([tensor.nb[i] for i in range(ndims)])
+    num_bytes = tuple([tensor.nb[i] for i in range(ndims)])
+    # TODO: convert to numpy strides
+    return num_bytes
 
 
 def to_numpy(tensor: Union[ggml_tensor, ggml_tensor_p]) -> np.ndarray:
@@ -77,6 +85,7 @@ def to_numpy(tensor: Union[ggml_tensor, ggml_tensor_p]) -> np.ndarray:
 
 
 GgmlShape = ctypes.c_int64 * GGML_MAX_DIMS
+GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS
 
 
 def from_file(
@@ -88,19 +97,37 @@ def from_file(
 
 def _pad_shape(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
     if len(shape) >= 4:
-        return shape
+        return shape  # type: ignore
 
     padding = (1,) * (4 - len(shape))
     return shape + padding  # type: ignore
 
 
-def from_numpy(ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"]) -> ggml_tensor_p:
+def _compute_nbytes(
+    ne: Tuple[int, int, int, int], type: ctypes.c_int
+) -> Tuple[int, int, int, int]:
+    nb0 = ggml_type_size(type)
+    nb1 = nb0 * (ne[0] // ggml_blck_size(type))
+    nb2 = nb1 * ne[1]
+    nb3 = nb2 * ne[2]
+    return (nb0, nb1, nb2, nb3)
+
+
+def from_numpy(
+    ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"]
+) -> ggml_tensor_p:
     if type(array).__name__ == "Tensor":
         array = array.numpy()
-    tensor_p = ggml_new_tensor(ctx, from_numpy_dtype(array.dtype), 1, GgmlShape())
+    # Create an empty tensor so we don't allocate memory for the data pointer
+    gtype = from_numpy_dtype(array.dtype)
+    tensor_p = ggml_new_tensor_1d(ctx, gtype, 0)
+    # Fill out the correct dimensions and shape.
     tensor_p.contents.n_dims = array.ndim
+    shape = _pad_shape(array.shape)
+    tensor_p.contents.ne = GgmlShape(*shape)
+    tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(shape, gtype))
+    # point the tensor data to the content of the numpy array.
     tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
-    tensor_p.contents.ne = GgmlShape(*_pad_shape(array.shape))
     # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
     # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")
 
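_compute_nbytes fills in the per-dimension byte strides the way ggml itself does: nb[0] is the (block) element size and each further nb[i] multiplies in the previous extent. A pure-Python check of the f32 case against the values asserted in test_nb_works further down:

    def compute_nbytes_f32(shape):
        # f32 has block size 1 and 4 bytes per element; pad to 4 dims
        # the way _pad_shape does.
        ne = tuple(shape) + (1,) * (4 - len(shape))
        nb0 = 4
        nb1 = nb0 * ne[0]
        nb2 = nb1 * ne[1]
        nb3 = nb2 * ne[2]
        return (nb0, nb1, nb2, nb3)

    assert compute_nbytes_f32((10,)) == (4, 40, 40, 40)
    assert compute_nbytes_f32((12, 22, 32)) == (4, 48, 1056, 33792)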
@@ -181,12 +208,14 @@ def GptVocab() -> NativeObj:
 def Fairseq2Model() -> NativeObj:
     return NativeObj("fairseq2_model")
 
+
 lib.std_string_alloc.argtypes = [ctypes.c_char_p]
 lib.std_string_alloc.restype = ctypes.c_void_p
 lib.std_string_free.argtypes = [ctypes.c_void_p]
 lib.std_string_free.restype = None
 NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free)
 
+
 @functools.lru_cache(1024)
 def CppStr(content: str) -> NativeObj:
     c_str = ctypes.create_string_buffer(content.encode("utf-8"))
@@ -196,7 +225,6 @@ def CppStr(content: str) -> NativeObj:
 
 lib.unity_model_load.argtypes = [ctypes.c_char_p, ctypes.c_void_p, ctypes.c_void_p]
 
-
 def unity_model_load(model_file: Path) -> Tuple[NativeObj, NativeObj]:
     model = UnityModel()
     vocab = GptVocab()
@@ -209,13 +237,15 @@ def unity_model_load(model_file: Path) -> Tuple[NativeObj, NativeObj]:
 
 
 lib.load_unity_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-lib.load_unity_ggml_file.restype = None
+lib.load_unity_ggml_file.restype = ctypes.c_int
 
 
 def load_unity_ggml_file(model_file: Path) -> NativeObj:
     model = Fairseq2Model()
     bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8"))
-    lib.load_unity_ggml_file(model.ptr, bytes_file)
+    err = lib.load_unity_ggml_file(model.ptr, bytes_file)
+    if err:
+        raise Exception("Failed to load model")
     return model
 
 
@@ -242,7 +272,7 @@ def unity_eval(
     return lib.unity_eval(allocr, model.ptr, tensor, n_threads)
 
 
-_FORWARD_CACHE: Dict[str, Callable[[...], ggml_tensor_p]] = {}
+_FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {}
 
 
 def forward(

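With restype set to c_int, a C++-side failure now surfaces as a Python exception instead of a silently half-initialized model. A hypothetical call site (the file name is the one used by the tests below):

    from pathlib import Path
    import ggml

    # Raises Exception("Failed to load model") if the C++ loader returns non-zero.
    model = ggml.load_unity_ggml_file(Path("seamlessM4T_medium.ggml"))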
+ 4 - 1
ggml/ggml_convert.py

@@ -94,7 +94,10 @@ def write_state_dict(out: BufferedWriter, state_dict: Dict[str, torch.Tensor]) -
     """
     for key, value in state_dict.items():
         write_string(out, key)
-        write_tensor(out, value)
+        if key.endswith(".bias") and value.ndim == 1:
+            # GGML broadcasting isn't as flexible as numpy's
+            value = value.reshape(1, -1)
+        write_tensor(out, value.contiguous())
 
 
 def write_string(out: BufferedWriter, value: str) -> None:

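The (1, -1) reshape sidesteps ggml's weaker broadcasting: numpy adds a (d,) bias to an (n, d) activation implicitly, while ggml_add wants the bias written out with an explicit leading dimension. A quick numpy illustration (shapes only; the bytes are unchanged):

    import numpy as np

    bias = np.arange(4, dtype=np.float32)  # (4,) as exported from PyTorch
    bias2d = bias.reshape(1, -1)           # (1, 4): same bytes, still contiguous
    assert np.shares_memory(bias, bias2d)
    assert (np.zeros((3, 4), dtype=np.float32) + bias2d).shape == (3, 4)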
+ 36 - 11
ggml/test_unity_cpp.py

@@ -63,9 +63,18 @@ def test_shape_works(ctx: Ctx) -> None:
     assert ggml.shape(c) == (12, 22, 32)
 
 
-@pytest.mark.xfail(
-    reason="TODO: understand diff between ggml strides and numpy strides"
-)
+def test_nb_works(ctx: Ctx) -> None:
+    a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
+    assert ggml.nb(a) == (4, 40, 40, 40)
+
+    b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F16, 11, 21)
+    assert ggml.nb(b) == (2, 22, 462, 462)
+
+    c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
+    assert ggml.nb(c) == (4, 48, 1056, 33792)
+
+
+@pytest.mark.xfail(reason="TODO: fix strides")
 def test_strides_works(ctx: Ctx) -> None:
     a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
     assert ggml.strides(a) == np.ones((10,), dtype=np.float32).strides
@@ -81,21 +90,37 @@ def test_to_numpy_works_with_f32(ctx: Ctx) -> None:
     a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)
     a = ggml.ggml_set_f32(a, 2.14)
     assert np.allclose(ggml.to_numpy(a), np.ones((10,)) * 2.14)
+
     b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21)
-    assert np.allclose(ggml.to_numpy(b), np.zeros((11, 21)))
+    b = ggml.ggml_set_f32(b, 2.14)
+    assert np.allclose(ggml.to_numpy(b), np.ones((11, 21)) * 2.14)
+
     c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
-    assert np.allclose(ggml.to_numpy(c), np.zeros((12, 22, 32)))
+    c = ggml.ggml_set_f32(c, 2.14)
+    assert np.allclose(ggml.to_numpy(c), np.ones((12, 22, 32)) * 2.14)
 
 
 def test_from_numpy_works_with_f32(ctx: Ctx) -> None:
     a = np.random.normal(size=(10,)).astype(dtype=np.float32)
     ga = ggml.from_numpy(ctx, a)
+    assert ggml.shape(ga) == (10,)
+    assert ggml.nb(ga) == ggml.nb(ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10))
     assert np.allclose(a, ggml.to_numpy(ga))
+
     a = np.random.normal(size=(11, 21)).astype(dtype=np.float32)
     ga = ggml.from_numpy(ctx, a)
+    assert ggml.shape(ga) == (11, 21)
+    assert ggml.nb(ga) == ggml.nb(
+        ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21)
+    )
     assert np.allclose(a, ggml.to_numpy(ga))
+
     a = np.random.normal(size=(12, 22, 32)).astype(dtype=np.float32)
     ga = ggml.from_numpy(ctx, a)
+    assert ggml.shape(ga) == (12, 22, 32)
+    assert ggml.nb(ga) == ggml.nb(
+        ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32)
+    )
     assert np.allclose(a, ggml.to_numpy(ga))
 
 
@@ -163,10 +188,12 @@ def g_model() -> NativeObj:
 @pytest.fixture(scope="module")
 def pt_model() -> Iterator[Any]:
     model = load_unity_model("seamlessM4T_medium")
+    print(model)
     model.eval()
     with torch.inference_mode():
         yield model
 
+
 @pytest.mark.xfail(reason="TODO")
 def test_hparams_code_is_up_to_date() -> None:
     model_file = Path(__file__).parent / "seamlessM4T_medium.ggml"
@@ -177,8 +204,8 @@ def test_hparams_code_is_up_to_date() -> None:
     assert hparams_struct in actual_code
 
 
-def test_unity_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
-    x = torch.empty((1024,))
+def test_forward_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
+    x = torch.empty((1024,))
     torch.nn.init.uniform_(x, -1, 1)
 
     # Test FFN without LayerNorm
@@ -196,15 +223,13 @@ def test_unity_ffn(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
     assert np.allclose(y_exp, y, rtol=1e-3)
 
 
-def test_unity_layer_norm(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
+def test_forward_layer_norm(ctx: Ctx, g_model: NativeObj, pt_model: Any) -> None:
     x = torch.empty((1024,))
     torch.nn.init.uniform_(x, -1, 1)
 
     y_exp = pt_model.text_encoder.layers[0].ffn_layer_norm(x).numpy()
     gx = ggml.from_numpy(ctx, x)
-    gy = ggml.forward(
-        "LayerNorm", g_model, "text_encoder.layers.0.ffn_layer_norm", gx
-    )
+    gy = ggml.forward("LayerNorm", g_model, "text_encoder.layers.0.ffn_layer_norm", gx)
     gf = ggml.ggml_build_forward(gy)
     ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)