@@ -23,6 +23,7 @@ ggml_tensor* ggml_detach(ggml_tensor* a) {
 // when we read garbage data.
 // It also prints memory usage information, which is useful to
 #define DEBUG_MEM_USAGE DEBUG
+size_t MB = 1024 * 1024;

 void printf_mem_usage(ggml_context* ctx, std::string name) {
 #if DEBUG_MEM_USAGE
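The body of printf_mem_usage is not shown in this hunk. As a minimal sketch of what it could look like with the new MB constant, assuming the standard ggml_used_mem() and ggml_get_mem_size() introspection calls (the actual implementation may differ):

void printf_mem_usage(ggml_context* ctx, std::string name) {
#if DEBUG_MEM_USAGE
    // Sketch only: report how much of this context's arena is in use.
    double used = ggml_used_mem(ctx) / (double)MB;
    double reserved = ggml_get_mem_size(ctx) / (double)MB;
    printf("%s: used %.1fMB of %.1fMB\n", name.c_str(), used, reserved);
#endif
}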
@@ -1367,11 +1368,13 @@ extern "C" Hypothesis* generate_sequence(
     // like encoder kv cache.
     // * step_alloc contains buffer for the forward pass of the model.
     // TODO: the size allocated should depend on the input length and vocab size
-    std::vector<uint8_t> local_bufs[5] = {
-        std::vector<uint8_t>(128 * 1024 * 1024),  // step_ctx
-        std::vector<uint8_t>(128 * 1024 * 1024),  // prev_step_ctx
-        std::vector<uint8_t>(256 * 1024 * 1024),  // search_ctx
-        std::vector<uint8_t>(256 * 1024 * 1024),  // step_alloc
+    // Split mem_mb across the different contexts we need to use.
+    int mem_mb = job.opts.mem_mb;
+    std::vector<uint8_t> local_bufs[4] = {
+        std::vector<uint8_t>(mem_mb * MB * 3 / 10),  // step_ctx
+        std::vector<uint8_t>(mem_mb * MB * 3 / 10),  // prev_step_ctx
+        std::vector<uint8_t>(mem_mb * MB * 3 / 10),  // search_ctx
+        std::vector<uint8_t>(mem_mb * MB * 1 / 10),  // step_alloc
     };
     ggml_allocr* step_alloc = new_arena_allocr(local_bufs[3]);

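A quick worked example of the split above, with a hypothetical budget of job.opts.mem_mb = 512: the three graph contexts each get 3/10 of the budget and the forward-pass arena gets the remaining 1/10. Multiplying by MB before dividing by 10 keeps the integer division from truncating whole megabytes, and the mixed int/size_t multiplication promotes mem_mb to size_t before the product can overflow.

#include <cstdio>
#include <cstddef>

int main() {
    const size_t MB = 1024 * 1024;
    int mem_mb = 512;  // hypothetical value of job.opts.mem_mb
    size_t ctx_bytes = mem_mb * MB * 3 / 10;  // step_ctx, prev_step_ctx, search_ctx each
    size_t fwd_bytes = mem_mb * MB * 1 / 10;  // step_alloc arena
    printf("per ctx: %.1fMB, step_alloc: %.1fMB\n",
           ctx_bytes / (double)MB, fwd_bytes / (double)MB);
    // prints: per ctx: 153.6MB, step_alloc: 51.2MB
    return 0;
}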
@@ -1463,7 +1466,7 @@ extern "C" Hypothesis* generate_sequence(
         ggml_allocr_reset(step_alloc);
 #if DEBUG_MEM_USAGE
         printf("beam search step %d. Graph.n_nodes: %d.\n", step_nr, gf.n_nodes);
-        printf(" Fwd mem: %.1fMB\n", fwd_mem/1024.0/1024.0);
+        printf(" Fwd mem: %.1fMB, reserved %.1fMB\n", fwd_mem/(double)MB, local_bufs[3].capacity()/(double)MB);
         std::fill(local_bufs[3].begin(), local_bufs[3].end(), 0xAA);
 #endif
         _tweak_lprobs(job, lprobs, step_nr, max_seq_len, vocab_size);
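The std::fill with 0xAA poisons the step_alloc arena between steps, so any tensor that is read before being written decodes to one conspicuous value instead of to plausible-looking leftovers from the previous step. A self-contained illustration of why the pattern is recognizable:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    uint8_t buf[4];
    std::memset(buf, 0xAA, sizeof buf);  // same poison pattern as the diff uses
    float f;
    std::memcpy(&f, buf, sizeof f);      // a "garbage read" of the poisoned arena
    printf("a poisoned float always reads as %g\n", f);
    return 0;
}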
@@ -1520,14 +1523,17 @@ extern "C" Hypothesis* generate_sequence(
         // Reorder beams in the `seq` and `score` buffers. The same beam can
         // be selected more than once.
         // (B, S), (B) -> (B, S)
+        // Don't use the allocr API here, because it might reuse a kv cache buffer several times.
+        ggml_set_no_alloc(step_ctx, false);
         ggml_tensor* new_seqs = ggml_get_rows(step_ctx, seqs, beam_indices);
         ggml_tensor* new_scores = ggml_get_rows(step_ctx, scores, beam_indices);
         ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
         ggml_build_forward_expand(&gf_reorder, new_scores);
-        reorder_kv_cache(model, step_ctx, &gf_reorder, beam_indices);
+        reorder_kv_cache(model, step_ctx, beam_indices, n_threads);
         ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, n_threads);
         seqs = ggml_detach(new_seqs);
         scores = ggml_detach(new_scores);
+        // reorder_kv_cache(model, step_ctx, beam_indices, n_threads);

         // seqs[:, step_nr + 1] = next_tokens
         // scores[:, step_nr + 1] = next_scores
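As a standalone sketch of the row gather that performs the reorder: ggml_get_rows copies whole rows of a (B, S) tensor selected by an i32 index vector, so one surviving beam can fan out into several slots. The toy shapes and index values below are made up, and the snippet assumes the older ggml revision this file targets, where ggml_build_forward returns a ggml_cgraph by value.

#include "ggml.h"
#include <cstdint>

int main() {
    ggml_init_params params = { /*mem_size=*/16 * 1024 * 1024, /*mem_buffer=*/nullptr, /*no_alloc=*/false };
    ggml_context* ctx = ggml_init(params);

    // (B=3, S=4) tensor and an i32 index vector; the same row may repeat.
    ggml_tensor* scores = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, /*S=*/4, /*B=*/3);
    ggml_set_f32(scores, 0.0f);  // give the rows defined contents
    ggml_tensor* beam_indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
    ((int32_t*)beam_indices->data)[0] = 2;  // new beam 0 <- old beam 2
    ((int32_t*)beam_indices->data)[1] = 2;  // the same beam is selected twice
    ((int32_t*)beam_indices->data)[2] = 0;  // new beam 2 <- old beam 0

    ggml_tensor* reordered = ggml_get_rows(ctx, scores, beam_indices);
    ggml_cgraph gf = ggml_build_forward(reordered);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads=*/1);

    ggml_free(ctx);
    return 0;
}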
@@ -1536,7 +1542,8 @@ extern "C" Hypothesis* generate_sequence(
             ((float*)scores->data)[step_nr + 1 + i * max_seq_len] = ggml_get_f32_1d(next_scores, i);
         }

-        printf_mem_usage(step_ctx, "step_ctx");
+        printf_mem_usage(step_ctx, " step_ctx");
+        printf_mem_usage(search_ctx, " search_ctx");
         ggml_free(prev_step_ctx);
         prev_step_ctx = step_ctx;
 #if DEBUG_MEM_USAGE
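The free-then-assign pair above is a double-buffering rotation: tensors produced in step N (seqs, scores, reordered kv cache entries) must stay readable while step N+1 builds on them, so each step context is kept alive for exactly one extra iteration. A minimal sketch of the pattern follows; how the two arenas are wired into ggml_init is an assumption, since the context creation is not shown in this hunk.

#include "ggml.h"
#include <cstdint>
#include <vector>

int main() {
    // Two arenas mirroring local_bufs[0] and local_bufs[1] above (sizes made up).
    std::vector<uint8_t> bufs[2] = { std::vector<uint8_t>(8 * 1024 * 1024),
                                     std::vector<uint8_t>(8 * 1024 * 1024) };
    ggml_context* prev_step_ctx = nullptr;
    for (int step_nr = 0; step_nr < 10; ++step_nr) {
        ggml_init_params p = { bufs[step_nr % 2].size(), bufs[step_nr % 2].data(), /*no_alloc=*/false };
        ggml_context* step_ctx = ggml_init(p);
        // ... build and compute this step's graph here; it may read tensors
        // that still live in prev_step_ctx (e.g. the previous kv cache) ...
        if (prev_step_ctx) ggml_free(prev_step_ctx);  // step N-1 is no longer read
        prev_step_ctx = step_ctx;                     // keep step N alive for step N+1
    }
    if (prev_step_ctx) ggml_free(prev_step_ctx);
    return 0;
}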