Parcourir la source

allocr for encoder (#256)

# Conflicts:
#	ggml/examples/unity/unity.cpp
Guillaume Wenzek il y a 1 an
Parent
commit
07f9a736fe
2 fichiers modifiés avec 14 ajouts et 8 suppressions
  1. 12 6
      ggml/examples/unity/unity.cpp
  2. 2 2
      ggml/include/ggml/ggml.h

+ 12 - 6
ggml/examples/unity/unity.cpp

@@ -17,6 +17,7 @@
 #include <iostream>
 #include <sndfile.h>
 #include <cstdlib>
+#include "ggml-alloc.h"
 
 struct unity_params {
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -111,7 +112,7 @@ Hypothesis* unity_decode(
         /*eos_idx*/model.vocab.token_to_id["</s>"],
         /*num_threads*/n_threads,
     };
-    struct ggml_tensor * prefix_seq = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 2);
+    FORCE_ALLOC(prefix_seq, model.ctx, ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 2));
     ((int *)prefix_seq->data)[0]  = job.eos_idx;
     ((int *)prefix_seq->data)[1]  = tgt_lang_idx;
     job.prefix_seq = prefix_seq;
@@ -133,13 +134,13 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
         return 1;
     }
-    int ctx_size_gb = 20;
-    if (model.hparams["w2v2_encoder_config__num_encoder_layers"] == 24) {
-        ctx_size_gb = 40;
-    } 
 
+    // The ctx_size_mb mostly depends on input length and model dim.
+    int ctx_size_mb = 128;
+    auto encoder_buf = std::vector<uint8_t>(128 * 1024 * 1024);
+    auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
+    ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
     char result_str[4096];
-    static std::vector<uint8_t> encoder_buf(ctx_size_gb * 1024LL * 1024LL * 1024LL);
 
     std::string input;
     bool interactive = params.files.size() == 0;
@@ -181,14 +182,18 @@ int main(int argc, char ** argv) {
 
         // Reset the ggml_context
         model.ctx = ctx_from_buffer(encoder_buf);
+        ggml_set_no_alloc(model.ctx, false);
         ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, info.frames, info.channels);
+        ggml_set_no_alloc(model.ctx, true);
 
         // Load audio input
         sf_readf_float(sndfile, (float*)seqs->data, info.frames);
 
         // Audio encoder
         ggml_cgraph* gf = unity_speech_encoder(model, seqs);
+        ggml_allocr_alloc_graph(fwd_alloc, gf);
         ggml_graph_compute_with_ctx(model.ctx, gf, params.n_threads);
+        // encoder_output is valid until we call `ggml_allocr_reset(fwd_alloc)`
         ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
 
         // Beam search decoding
@@ -201,6 +206,7 @@ int main(int argc, char ** argv) {
         int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str);
         std::cout << std::string((char*)&result_str, n) << std::endl;
         ggml_free(model.ctx);
+        ggml_allocr_reset(fwd_alloc);
     }
 
     return 0;

+ 2 - 2
ggml/include/ggml/ggml.h

@@ -214,7 +214,7 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS          4
-#define GGML_MAX_NODES         8192
+#define GGML_MAX_NODES         4096
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
@@ -530,7 +530,7 @@ extern "C" {
     // next prime after GGML_MAX_NODES
     // #define GGML_GRAPH_HASHTABLE_SIZE 4099
     // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
-    #define GGML_GRAPH_HASHTABLE_SIZE 16411
+    #define GGML_GRAPH_HASHTABLE_SIZE 8273
 
     // computation graph
     struct ggml_cgraph {