@@ -23,12 +23,13 @@ ggml_tensor* ggml_detach(ggml_tensor* a) {
// when we read garbage data.
// It also prints memory usage information, which is useful to
#define DEBUG_MEM_USAGE DEBUG
+size_t MB = 1024 * 1024;

void printf_mem_usage(ggml_context* ctx, std::string name) {
#if DEBUG_MEM_USAGE
double mb = 1024.0 * 1024.0;
printf(
- "ctx %s: memory used = %8.2f MB, memory reserved = %8.2f Mb\n",
+ "%s: memory used = %8.2f MB, memory reserved = %8.2f MB\n",
name.c_str(),
ggml_used_mem(ctx) / mb,
ggml_get_mem_size(ctx) / mb
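Reviewer note: with the new `MB` constant and the trimmed format string, the helper can be exercised on a throwaway context. A minimal sketch, not part of the patch (the 16 MB pool size is arbitrary):

    ggml_init_params params = {
        /*.mem_size   =*/ 16 * MB,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    ggml_context* demo = ggml_init(params);
    ggml_new_tensor_1d(demo, GGML_TYPE_F32, 1024 * 1024);  // ~4 MB of F32 data
    printf_mem_usage(demo, "demo");  // used: ~4 MB plus tensor overhead, reserved: 16 MB
    ggml_free(demo);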
@@ -107,6 +108,9 @@ void append_to_prev_kv(const fairseq2_model& model, const std::string& prefix, g
KeyValueTensor& kv = model.kv_cache[prefix];
int step_nr = kv.step_nr;
ggml_context* ctx = model.kv_cache_ctx ? model.kv_cache_ctx : model.ctx;
+ // We need to force allocation here, otherwise the kv_cache buffers can be reused
+ bool no_alloc_save = ggml_get_no_alloc(ctx);
+ ggml_set_no_alloc(ctx, false);
int n_steps = (*k)->ne[1];
int k_proj, batch_size;

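The forced allocation matters because a context created with `no_alloc = true` hands out tensors without backing data, expecting a graph allocator to reuse scratch memory behind them; cache tensors instead need stable storage that survives across steps. A minimal sketch of the same save/force/restore pattern in isolation (hypothetical helper name):

    static ggml_tensor* new_persistent_f32(ggml_context* ctx, int64_t n) {
        bool no_alloc_save = ggml_get_no_alloc(ctx);
        ggml_set_no_alloc(ctx, false);          // force a real allocation from ctx's pool
        ggml_tensor* t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
        ggml_set_no_alloc(ctx, no_alloc_save);  // restore the caller's setting
        return t;
    }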
@@ -135,15 +139,15 @@ void append_to_prev_kv(const fairseq2_model& model, const std::string& prefix, g

// qk is (B * H, Sq, Sk) == (B*H, 1, Sk) in incremental mode
// we return the Sq slice of the (Sq, Sk) attention mask
- *self_attn_mask = ggml_slice(
- model.ctx,
- ggml_slice(model.ctx, kv.self_attn_mask, 0, 0, step_nr),
- 1,
- step_nr - 1,
- step_nr
- );
+ if (self_attn_mask != nullptr) {
+ *self_attn_mask = ggml_slice(
+ ctx, ggml_slice(ctx, kv.self_attn_mask, 0, 0, step_nr),
+ 1, step_nr - 1, step_nr
+ );
+ }

kv.step_nr = step_nr;
+ ggml_set_no_alloc(ctx, no_alloc_save);
}

// variant of ggml_get_rows that allows for a with more than 2 dims.
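Shape walk-through for the guarded slice above, assuming `ggml_slice(ctx, t, dim, start, end)` is this file's helper returning the `[start, end)` range of `t` along `dim`:

    // kv.self_attn_mask : full (query x key) causal mask prepared up front
    // inner slice, dim 0, [0, step_nr)           -> keep only the keys generated so far
    // outer slice, dim 1, [step_nr - 1, step_nr) -> keep only the current query row
    // result: the 1-row mask matching qk == (B*H, 1, Sk) in incremental mode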
@@ -636,32 +640,19 @@ extern "C" ggml_tensor* RelativePositionMHA_forward(
// we store the results (fixed) in checkpoint as model.audio_enc_pos_enc_w and load directly.
ggml_tensor* r = ggml_get_rows(ctx, model.tensors["speech_encoder.pos_enc"], rows);
r = mul_mat(ctx, model.tensors[prefix + ".sdpa.r_proj.weight"], r);
- r = ggml_dup(ctx, ggml_permute(ctx,
- ggml_cpy(ctx,
- r,
- ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S*2-1)),
- 0, 2, 1, 3));
+ r = ggml_dup(ctx, ggml_permute(ctx, ggml_unflatten_1d(ctx, r, 0, K_h), 0, 2, 1, 3));

ggml_tensor* u_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.u_bias"], K_h, 1, H);
ggml_tensor* v_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.v_bias"], K_h, 1, H);

// self_attn: Permute QKV

- ggml_tensor* Q = ggml_cont(ctx, ggml_permute(ctx,
- ggml_cpy(ctx,
- Qcur,
- ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
- 0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
- ggml_tensor* K = ggml_cont(ctx, ggml_permute(ctx,
- ggml_cpy(ctx,
- Kcur,
- ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
- 0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
- ggml_tensor* V = ggml_cont(ctx, ggml_permute(ctx,
- ggml_cpy(ctx,
- Vcur,
- ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)),
- 1, 2, 0, 3)); // (H * K_h, S) -> (K_h, H, S) -> (H, S, K_h)
+ // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
+ ggml_tensor* Q = ggml_cont(ctx, ggml_permute(ctx, ggml_unflatten_1d(ctx, Qcur, 0, K_h), 0, 2, 1, 3));
+ // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H)
+ ggml_tensor* K = ggml_cont(ctx, ggml_permute(ctx, ggml_unflatten_1d(ctx, Kcur, 0, K_h), 0, 2, 1, 3));
+ // (H * K_h, S) -> (K_h, H, S) -> (H, S, K_h)
+ ggml_tensor* V = ggml_cont(ctx, ggml_permute(ctx, ggml_unflatten_1d(ctx, Vcur, 0, K_h), 1, 2, 0, 3));


ggml_tensor* q_with_u_bias = ggml_add_inplace(ctx, ggml_dup(ctx, Q), u_bias); // (K_h, S, H)
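`ggml_unflatten_1d` is this project's helper; both the old `ggml_cpy` + `ggml_new_tensor_3d` pattern and the new call split the flat head dimension, the new one without paying for an extra copy before the permute. A minimal sketch of the equivalence, assuming contiguous input and `ggml_unflatten_1d(ctx, t, dim, size)` splitting `dim` into `(size, ne[dim] / size)`:

    // Qcur: (H * K_h, S) flat projection output, contiguous
    ggml_tensor* split = ggml_reshape_3d(ctx, Qcur, K_h, H, Qcur->ne[1]);  // (K_h, H, S)
    // ... which is what ggml_unflatten_1d(ctx, Qcur, 0, K_h) yields here as a view,
    // whereas the old code first materialised the (K_h, H, S) tensor with ggml_cpy.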
@@ -670,7 +661,6 @@ extern "C" ggml_tensor* RelativePositionMHA_forward(
ggml_tensor* ac = mul_mat(ctx, K, q_with_u_bias);
ggml_tensor* bd = mul_mat(ctx, r, q_with_v_bias);

-
// self_attn: shift_bd. Logic follows https://github.com/facebookresearch/fairseq2/blob/main/src/fairseq2/nn/transformer/relative_attention.py#L161
bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // H, S, 2S-1

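Only whitespace changes here, but for reviewers following the link: the shift turns scores over all relative offsets into per-query windows. A sketch of the intent only; the exact pad/view/slice steps and the sign convention live in the linked fairseq2 code:

    // bd after the permute : (H, S, 2S-1)  every query scored against every relative offset
    // shift_bd             : (H, S, S)     row i keeps just the S offsets that are valid
    //                                      relative positions for query i (no gather needed)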
@@ -1366,12 +1356,13 @@ extern "C" Hypothesis* generate_sequence(
// * search_ctx contains tensors that should live for the full search,
// like encoder kv cache.
// * step_alloc contains buffer for the forward pass of the model.
- // TODO: the size allocated should depend on the input length and vocab size
- std::vector<uint8_t> local_bufs[5] = {
- std::vector<uint8_t>(128 * 1024 * 1024), // step_ctx
- std::vector<uint8_t>(128 * 1024 * 1024), // prev_step_ctx
- std::vector<uint8_t>(256 * 1024 * 1024), // search_ctx
- std::vector<uint8_t>(256 * 1024 * 1024), // step_alloc
+ // Split mem_mb across the different contexts we need to use.
+ int mem_mb = job.opts.mem_mb;
+ std::vector<uint8_t> local_bufs[4] = {
+ std::vector<uint8_t>(mem_mb * MB * 3 / 10), // step_ctx
+ std::vector<uint8_t>(mem_mb * MB * 3 / 10), // prev_step_ctx
+ std::vector<uint8_t>(mem_mb * MB * 3 / 10), // search_ctx
+ std::vector<uint8_t>(mem_mb * MB * 1 / 10), // step_alloc
};
ggml_allocr* step_alloc = new_arena_allocr(local_bufs[3]);

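The hard-coded 128/256 MB arenas become a single caller-controlled budget, split 3/10 + 3/10 + 3/10 + 1/10 across the four buffers. A minimal sketch of how one slice backs a ggml context (a 512 MB budget is assumed here; the surrounding code presumably does the equivalent for step_ctx, prev_step_ctx and search_ctx):

    int mem_mb = 512;                                     // would come from job.opts.mem_mb
    std::vector<uint8_t> search_buf(mem_mb * MB * 3 / 10);
    ggml_init_params params = {
        /*.mem_size   =*/ search_buf.size(),
        /*.mem_buffer =*/ search_buf.data(),
        /*.no_alloc   =*/ false,
    };
    ggml_context* search_ctx = ggml_init(params);         // tensors are placed inside search_buf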
@@ -1418,6 +1409,8 @@ extern "C" Hypothesis* generate_sequence(
_bootstrap_seqs_and_scores(
model, job, seqs, scores, encoder_output, encoder_padding_mask, n_threads
);
+ // From now on we only add self_attn.k_cache entries, and those need to be re-sorted and copied at every step.
+ model.kv_cache_ctx = nullptr;

// Holds the indices of beams (a beam can occur more than once) that we
// should continue with in the next step.
@@ -1463,7 +1456,7 @@ extern "C" Hypothesis* generate_sequence(
ggml_allocr_reset(step_alloc);
#if DEBUG_MEM_USAGE
printf("beam search step %d. Graph.n_nodes: %d.\n", step_nr, gf.n_nodes);
- printf(" Fwd mem: %.1fMB\n", fwd_mem/1024.0/1024.0);
+ printf(" Fwd mem: %.1fMB, reserved %.1fMB\n", fwd_mem/(double)MB, local_bufs[3].capacity()/(double)MB);
std::fill(local_bufs[3].begin(), local_bufs[3].end(), 0xAA);
#endif
_tweak_lprobs(job, lprobs, step_nr, max_seq_len, vocab_size);
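The `0xAA` fill pairs with the "garbage data" comment at the top of the patch: after the allocator is reset, the whole scratch buffer is poisoned so that any node reading memory it never wrote produces a distinctive repeated value instead of stale-but-plausible activations. A small illustration, not in the patch, of what that byte pattern looks like as a float:

    uint8_t poison[4] = {0xAA, 0xAA, 0xAA, 0xAA};
    float f;
    memcpy(&f, poison, sizeof f);   // bit pattern 0xAAAAAAAA
    printf("%g\n", f);              // ~ -3e-13: small, negative, and the same everywhere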
@@ -1520,6 +1513,8 @@ extern "C" Hypothesis* generate_sequence(
// Reorder beams in the `seq` and `score` buffers. The same beam can
// be selected more than once.
// (B, S), (B) -> (B, S)
+ // Don't use the allocr API here, because it might reuse a kv cache buffer several times.
+ ggml_set_no_alloc(step_ctx, false);
ggml_tensor* new_seqs = ggml_get_rows(step_ctx, seqs, beam_indices);
ggml_tensor* new_scores = ggml_get_rows(step_ctx, scores, beam_indices);
ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
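The reorder leans on `ggml_get_rows` gathering whole rows by index, which is how the same beam can be kept more than once. A toy sketch with hypothetical sizes (3 beams, beam 0 kept twice, beam 1 dropped):

    ggml_tensor* idx = ggml_new_tensor_1d(step_ctx, GGML_TYPE_I32, 3);
    ((int32_t*)idx->data)[0] = 0;   // slot 0 continues beam 0
    ((int32_t*)idx->data)[1] = 0;   // slot 1 also continues beam 0
    ((int32_t*)idx->data)[2] = 2;   // slot 2 continues beam 2
    ggml_tensor* reordered = ggml_get_rows(step_ctx, scores, idx);  // one gathered row per surviving beam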
@@ -1536,7 +1531,7 @@ extern "C" Hypothesis* generate_sequence(
((float*)scores->data)[step_nr + 1 + i * max_seq_len] = ggml_get_f32_1d(next_scores, i);
}

- printf_mem_usage(step_ctx, "step_ctx");
+ printf_mem_usage(step_ctx, " step_ctx");
ggml_free(prev_step_ctx);
prev_step_ctx = step_ctx;
#if DEBUG_MEM_USAGE
@@ -1553,6 +1548,7 @@ end_of_beam_search:
[](Hypothesis a, Hypothesis b) { return a.score > b.score; }
);

+ printf_mem_usage(search_ctx, "search_ctx");
fairseq2_kv_cache_reset(model);
model.ctx = original_ctx;
return finished_searches_begin;