2 years ago · 6cf3dfeb05
--- a/ggml/examples/unity/unity.cpp
+++ b/ggml/examples/unity/unity.cpp
@@ -469,9 +469,10 @@ extern "C" bool unity_model_load(const char* fname, unity_model& model, gpt_voca
 
				 }
			
 
				 
			
 
				 // build the computation graph
			
 
				-extern "C" struct ggml_cgraph * unity_graph(
			
 
				-        const unity_model & model,
			
 
				-        struct ggml_allocr * allocr) {
			
 
				+extern "C" ggml_cgraph* unity_graph(
			
 
				+    const unity_model & model,
			
 
				+    ggml_tensor* input
			
 
				+) {
			
 
				 
			
 
				     const auto & hparams = model.hparams;
			
 
				 
			
@@ -494,23 +495,11 @@ extern "C" struct ggml_cgraph * unity_graph(
 
				     struct ggml_context * ctx0 = ggml_init(params);
			
 
				 
			
 
				     struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
			
 
				-    
			
 
				-    /// For dev, load an example input before conformer blocks
			
 
				-    auto file = std::ifstream("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_conformer_block.bin", std::ios::binary);
			
 
				-    if (!file) {
			
 
				-        file = std::ifstream("/home/guw/github/seamless_communication/ggml/examples/unity/models/unity-large/seqs_before_conformer_block.bin", std::ios::binary);
			
 
				-        if (!file) {
			
 
				-            std::cerr << "Failed to open binary file." << std::endl;
			
 
				-            exit(1);
			
 
				-        }
			
 
				-    }
			
 
				-    struct ggml_tensor * inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1024, 137);
			
 
				-    inpL->data = malloc(ggml_nbytes(inpL));
			
 
				-    file.read(reinterpret_cast<char *>(inpL->data), ggml_nbytes(inpL));
			
 
				     struct ggml_tensor * ffn_scale = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, 1);
			
 
				     ffn_scale->data = malloc(ggml_nbytes(ffn_scale));
			
 
				     ggml_set_f32(ffn_scale, 0.5f);
			
 
				     
			
 
				+    ggml_tensor* inpL = input;
			
 
				     for (int il = 0; il < n_audio_enc_layer; ++il) {
			
 
				         struct ggml_tensor * cur = inpL;
			
 
				         struct ggml_tensor * residual = cur;
			
@@ -603,9 +592,10 @@ extern "C" struct ggml_cgraph * unity_graph(
 
				     return gf;
			
 
				 }
			
 
				 
			
 
				-extern "C" struct ggml_cgraph * unity_eval(
			
 
				-        const unity_model & model,
			
 
				-        struct ggml_allocr * allocr,
			
 
				+extern "C" struct ggml_cgraph* unity_eval(
			
 
				+        ggml_allocr* allocr,
			
 
				+        const unity_model& model,
			
 
				+        ggml_tensor* input,
			
 
				         const int n_threads) {
			
 
				 
			
 
				     // const auto & hparams = model.hparams;
			
@@ -613,7 +603,7 @@ extern "C" struct ggml_cgraph * unity_eval(
 
				     // reset the allocator to free all the memory allocated during the previous inference
			
 
				     ggml_allocr_reset(allocr);
			
 
				 
			
 
				-    struct ggml_cgraph * gf = unity_graph(model, allocr);
			
 
				+    struct ggml_cgraph * gf = unity_graph(model, input);
			
 
				 
			
 
				     // allocate tensors
			
 
				     ggml_allocr_alloc_graph(allocr, gf);
			
@@ -669,14 +659,26 @@ int main(int argc, char ** argv) {
 
				         }
			
 
				     }
			
 
				 
			
 
				+    /// For dev, load an example input before conformer blocks
			
 
				+    auto file = std::ifstream("/private/home/dnn/internal_sc/seamless_communication/ggml/examples/unity/dev/seqs_before_conformer_block.bin", std::ios::binary);
			
 
				+    if (!file) {
			
 
				+        file = std::ifstream("/home/guw/github/seamless_communication/ggml/examples/unity/models/unity-large/seqs_before_conformer_block.bin", std::ios::binary);
			
 
				+        if (!file) {
			
 
				+            std::cerr << "Failed to open binary file." << std::endl;
			
 
				+            exit(1);
			
 
				+        }
			
 
				+    }
			
 
				+    struct ggml_tensor * input = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, 1024, 137);
			
 
				+    input->data = malloc(ggml_nbytes(input));
			
 
				+    file.read(reinterpret_cast<char *>(input->data), ggml_nbytes(input));
			
 
				+
			
 
				     // keep this buffer alive while evaluating the model
			
 
				     std::vector<uint8_t> compute_buffer;
			
 
				     struct ggml_allocr * allocr = NULL;
			
 
				     // allocate the compute buffer
			
 
				     {
			
 
				         allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
			
 
				-        struct ggml_cgraph * gf = unity_graph(model, allocr);
			
 
				-        
			
 
				+        struct ggml_cgraph * gf = unity_graph(model, input);
			
 
				 
			
 
				         // compute the required memory
			
 
				         size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
			
@@ -689,7 +691,7 @@ int main(int argc, char ** argv) {
 
				         fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
			
 
				     }
			
 
				 
			
 
				-    if (!unity_eval(model, allocr, 1)) {
			
 
				+    if (!unity_eval(allocr, model, input, 1)) {
			
 
				         printf("Failed to predict\n");
			
 
				         return 1;
			
 
				     }
			
--- a/ggml/examples/unity/unity_model_loader.h
+++ b/ggml/examples/unity/unity_model_loader.h
@@ -216,6 +216,8 @@ void init_attention_head(
 
				     init_ffn_layer(head->ffn, model_ctx, prefix + ".ffn");
			
 
				 }
			
 
				 
			
 
				+// TODO: attention_head_compute_graph
			
 
				+
			
 
				 // Text Decoder
			
 
				 
			
 
				 struct text_decoder {
			
--- a/ggml/ggml.py
+++ b/ggml/ggml.py
@@ -78,15 +78,33 @@ def to_numpy(tensor: Union[ggml_tensor, ggml_tensor_p]) -> np.ndarray:
 
				 GgmlShape = ctypes.c_int64 * GGML_MAX_DIMS
			
 
				 
			
 
				 
			
 
				+def from_file(
			
 
				+    ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32
			
 
				+) -> ggml_tensor_p:
			
 
				+    data = np.fromfile(str(file), dtype=dtype).reshape(shape)  # type: ignore
			
 
				+    return from_numpy(ctx, data)
			
 
				+
			
 
				+
			
 
				+def _pad_shape(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]:
			
 
				+    if len(shape) >= 4:
			
 
				+        return shape
			
 
				+
			
 
				+    padding = (1,) * (4 - len(shape))
			
 
				+    return shape + padding  # type: ignore
			
 
				+
			
 
				+
			
 
				 def from_numpy(ctx: ggml_context_p, array: np.ndarray) -> ggml_tensor_p:
			
 
				     tensor_p = ggml_new_tensor(
			
 
				-        ctx, from_numpy_dtype(array.dtype), 1, GgmlShape(0, 0, 0, 0)
			
 
				+        ctx, from_numpy_dtype(array.dtype), 1, GgmlShape()
			
 
				     )
			
 
				     tensor_p.contents.n_dims = array.ndim
			
 
				     tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p)
			
 
				-    tensor_p.contents.ne = GgmlShape(*array.shape)
			
 
				+    tensor_p.contents.ne = GgmlShape(*_pad_shape(array.shape))
			
 
				     # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}")
			
 
				     # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}")
			
 
				+
			
 
				+    # prevent the underlying numpy array to be freed
			
 
				+    setattr(tensor_p, "__data", array)
			
 
				     return tensor_p
			
 
				 
			
 
				 
			
@@ -142,7 +160,9 @@ def MeasureArena() -> NativeObj:
 
				 
			
 
				 def FixedSizeArena(mem_size: int) -> NativeObj:
			
 
				     memory = torch.zeros(mem_size, dtype=torch.uint8)
			
 
				-    allocr = ggml_allocr_new(ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN)
			
 
				+    allocr = ggml_allocr_new(
			
 
				+        ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN
			
 
				+    )
			
 
				     arena = NativeObj("ggml_allocr", allocr)
			
 
				     # Add a reference from the arena object to the underlying tensor, otherwise it will be freed to early.
			
 
				     setattr(arena, "__memory", memory)
			
@@ -157,7 +177,6 @@ def GptVocab() -> NativeObj:
 
				     return NativeObj("gpt_vocab")
			
 
				 
			
 
				 
			
 
				-
			
 
				 lib.unity_model_load.argtypes = [ctypes.c_char_p, ctypes.c_void_p, ctypes.c_void_p]
			
 
				 
			
 
				 
			
@@ -176,13 +195,20 @@ lib.unity_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
 
				 lib.unity_graph.restype = ctypes.POINTER(ggml_cgraph)
			
 
				 
			
 
				 
			
 
				-def unity_graph(model: NativeObj, allocr: NativeObj) -> ggml_cgraph_p:
			
 
				-    return lib.unity_graph(model.ptr, allocr.ptr)  # type: ignore
			
 
				+def unity_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p:
			
 
				+    return lib.unity_graph(model.ptr, tensor)  # type: ignore
			
 
				 
			
 
				 
			
 
				-lib.unity_eval.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
			
 
				+lib.unity_eval.argtypes = [
			
 
				+    ctypes.c_void_p,
			
 
				+    ctypes.c_void_p,
			
 
				+    ctypes.POINTER(ggml_tensor),
			
 
				+    ctypes.c_int,
			
 
				+]
			
 
				 lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph)
			
 
				 
			
 
				 
			
 
				-def unity_eval(model: NativeObj, allocr: NativeObj, n_threads: int) -> ggml_cgraph_p:
			
 
				-    return lib.unity_eval(model.ptr, allocr.ptr, n_threads)
			
 
				+def unity_eval(
			
 
				+    allocr: NativeObj, model: NativeObj, tensor: ggml_tensor_p, n_threads: int
			
 
				+) -> ggml_cgraph_p:
			
 
				+    return lib.unity_eval(allocr.ptr, model.ptr, tensor, n_threads)
			
--- a/ggml/test_unity_cpp.py
+++ b/ggml/test_unity_cpp.py
@@ -3,11 +3,13 @@ import ctypes
 
				 import torch
			
 
				 import pytest
			
 
				 import numpy as np
			
 
				+from pathlib import Path
			
 
				 from typing import Iterator
			
 
				 from ggml import NativeObj
			
 
				 
			
 
				 Ctx = ggml.ggml_context_p
			
 
				 
			
 
				+UNITY_MODELS = Path(__file__).parent / "examples/unity/models"
			
 
				 PARAMS_16MB = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None)
			
 
				 
			
 
				 
			
@@ -120,25 +122,26 @@ def test_from_numpy_works_with_f16(ctx: Ctx) -> None:
 
				     assert np.allclose(a, ggml.to_numpy(ga))
			
 
				 
			
 
				 
			
 
				-def test_unity_model_load() -> None:
			
 
				+def test_unity_model_load(ctx: Ctx) -> None:
			
 
				     model, vocab = ggml.unity_model_load(
			
 
				-        "examples/unity/models/unity-large/ggml-model.bin"
			
 
				+        UNITY_MODELS / "unity-large/ggml-model.bin"
			
 
				     )
			
 
				     print(model, vocab)
			
 
				+
			
 
				+    example = ggml.from_file(ctx, UNITY_MODELS / "unity-large/seqs_before_conformer_block.bin", (1024, 137))
			
 
				+
			
 
				     with ggml.MeasureArena() as arena:
			
 
				-        # compute graph
			
 
				-        graph = ggml.unity_graph(model, arena)
			
 
				-        # required memory
			
 
				-        # TODO: why the extra padding ?
			
 
				+        graph = ggml.unity_graph(model, example)
			
 
				+        # TODO: why the extra memory ?
			
 
				         mem_size = ggml.ggml_allocr_alloc_graph(arena.ptr, graph) + ggml.GGML_MEM_ALIGN
			
 
				 
			
 
				-    compute_buffer = torch.zeros(mem_size, dtype=torch.uint8)
			
 
				     with ggml.FixedSizeArena(mem_size) as allocr:
			
 
				         print(f"unity_graph: compute buffer size: {mem_size/1024/1024} MB")
			
 
				 
			
 
				-        eval_res_ptr = ggml.unity_eval(model, allocr, 1)
			
 
				+        eval_res_ptr = ggml.unity_eval(allocr, model, example, 1)
			
 
				         eval_res = eval_res_ptr.contents
			
 
				         inpL = ggml.to_numpy(eval_res.nodes[eval_res.n_nodes - 1])
			
 
				         expected_raw = "-0.1308,0.0346,-0.2656,0.2873,-0.0104,0.0574,0.4033,-0.1125,-0.0460,-0.0496"
			
 
				         expected = map(float, expected_raw.split(","))
			
 
				         assert np.allclose(inpL[0, :10], list(expected), atol=1e-4)
			
 
				+