1 year ago · 07f9a736fe
--- a/ggml/examples/unity/unity.cpp
+++ b/ggml/examples/unity/unity.cpp
@@ -17,6 +17,7 @@
 
															 #include <iostream>
														
 
															 #include <sndfile.h>
														
 
															 #include <cstdlib>
														
 
															+#include "ggml-alloc.h"
														
 
															 struct unity_params {
														
 
															     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
														
@@ -111,7 +112,7 @@ Hypothesis* unity_decode(
 
															         /*eos_idx*/model.vocab.token_to_id["</s>"],
														
 
															         /*num_threads*/n_threads,
														
 
															     };
														
 
															-    struct ggml_tensor * prefix_seq = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 2);
														
 
															+    FORCE_ALLOC(prefix_seq, model.ctx, ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 2));
														
 
															     ((int *)prefix_seq->data)[0]  = job.eos_idx;
														
 
															     ((int *)prefix_seq->data)[1]  = tgt_lang_idx;
														
 
															     job.prefix_seq = prefix_seq;
														
@@ -133,13 +134,13 @@ int main(int argc, char ** argv) {
 
															         fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
														
 
															         return 1;
														
 
															     }
														
 
															-    int ctx_size_gb = 20;
														
 
															-    if (model.hparams["w2v2_encoder_config__num_encoder_layers"] == 24) {
														
 
															-        ctx_size_gb = 40;
														
 
															-    } 
														
 
															+    // The ctx_size_mb mostly depends on input length and model dim.
														
 
															+    int ctx_size_mb = 128;
														
 
															+    auto encoder_buf = std::vector<uint8_t>(128 * 1024 * 1024);
														
 
															+    auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
														
 
															+    ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
														
 
															     char result_str[4096];
														
 
															-    static std::vector<uint8_t> encoder_buf(ctx_size_gb * 1024LL * 1024LL * 1024LL);
														
 
															     std::string input;
														
 
															     bool interactive = params.files.size() == 0;
														
@@ -181,14 +182,18 @@ int main(int argc, char ** argv) {
 
															         // Reset the ggml_context
														
 
															         model.ctx = ctx_from_buffer(encoder_buf);
														
 
															+        ggml_set_no_alloc(model.ctx, false);
														
 
															         ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, info.frames, info.channels);
														
 
															+        ggml_set_no_alloc(model.ctx, true);
														
 
															         // Load audio input
														
 
															         sf_readf_float(sndfile, (float*)seqs->data, info.frames);
														
 
															         // Audio encoder
														
 
															         ggml_cgraph* gf = unity_speech_encoder(model, seqs);
														
 
															+        ggml_allocr_alloc_graph(fwd_alloc, gf);
														
 
															         ggml_graph_compute_with_ctx(model.ctx, gf, params.n_threads);
														
 
															+        // encoder_output is valid until we call `ggml_allocr_reset(fwd_alloc)`
														
 
															         ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
														
 
															         // Beam search decoding
														
@@ -201,6 +206,7 @@ int main(int argc, char ** argv) {
 
															         int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str);
														
 
															         std::cout << std::string((char*)&result_str, n) << std::endl;
														
 
															         ggml_free(model.ctx);
														
 
															+        ggml_allocr_reset(fwd_alloc);
														
 
															     }
														
 
															     return 0;
														
--- a/ggml/include/ggml/ggml.h
+++ b/ggml/include/ggml/ggml.h
@@ -214,7 +214,7 @@
 
															 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
														
 
															 #define GGML_MAX_DIMS          4
														
 
															-#define GGML_MAX_NODES         8192
														
 
															+#define GGML_MAX_NODES         4096
														
 
															 #define GGML_MAX_PARAMS        256
														
 
															 #define GGML_MAX_CONTEXTS      64
														
 
															 #define GGML_MAX_SRC           6
														
@@ -530,7 +530,7 @@ extern "C" {
 
															     // next prime after GGML_MAX_NODES
														
 
															     // #define GGML_GRAPH_HASHTABLE_SIZE 4099
														
 
															     // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
														
 
															-    #define GGML_GRAPH_HASHTABLE_SIZE 16411
														
 
															+    #define GGML_GRAPH_HASHTABLE_SIZE 8273
														
 
															     // computation graph
														
 
															     struct ggml_cgraph {