1 tahun lalu · 31f2419086
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -160,9 +160,6 @@ target_include_directories(kaldi-native-fbank PUBLIC
 
															   ${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc
														
 
															 )
														
 
															-option( TRACY_ENABLE "" ON)
														
 
															-option( TRACY_ON_DEMAND "" ON)
														
 
															-add_subdirectory (tracy)
														
 
															 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
														
 
															     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
														
--- a/ggml/examples/common.cpp
+++ b/ggml/examples/common.cpp
@@ -806,4 +806,4 @@ void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
 
															     fprintf(stderr, "  -o FNAME, --out FNAME\n");
														
 
															     fprintf(stderr, "                        output file (default: %s)\n", params.fname_out.c_str());
														
 
															     fprintf(stderr, "\n");
														
 
															-}
														
 
															+}
														
--- a/ggml/examples/common.h
+++ b/ggml/examples/common.h
@@ -179,4 +179,4 @@ struct sam_params {
 
															 bool sam_params_parse(int argc, char ** argv, sam_params & params);
														
 
															-void sam_print_usage(int argc, char ** argv, const sam_params & params);
														
 
															+void sam_print_usage(int argc, char ** argv, const sam_params & params);
														
--- a/ggml/examples/unity/fairseq2.cpp
+++ b/ggml/examples/unity/fairseq2.cpp
@@ -7,8 +7,6 @@
 
															 #include "kaldi-native-fbank/csrc/feature-fbank.h"
														
 
															 #include "kaldi-native-fbank/csrc/feature-window.h"
														
 
															-#include "tracy/Tracy.hpp"
														
 
															-
														
 
															 #include "fairseq2.h"
														
 
															 #include "ggml.h"
														
@@ -18,7 +16,7 @@ ggml_tensor* ggml_detach(ggml_tensor* a) {
 
															     return a;
														
 
															 }
														
 
															-#define DEBUG_MEM_USAGE 1
														
 
															+#define DEBUG_MEM_USAGE 0
														
 
															 void printf_mem_usage(ggml_context* ctx, std::string name) {
														
 
															 #if DEBUG_MEM_USAGE
														
@@ -1147,7 +1145,6 @@ extern "C" void _bootstrap_seqs_and_scores(
 
															     ggml_tensor* encoder_output,
														
 
															     ggml_tensor* encoder_padding_mask
														
 
															 ) {
														
 
															-    ZoneScoped;
														
 
															     int prefix_seq_len = job.prefix_seq->ne[0];
														
 
															     int max_seq_len = scores->ne[0];
														
 
															     int beam_size = scores->ne[1];
														
@@ -1188,6 +1185,7 @@ extern "C" void _bootstrap_seqs_and_scores(
 
															     ggml_cgraph gf = ggml_build_forward(lprobs);
														
 
															     ggml_graph_compute_with_ctx(ctx, &gf, 1);
														
 
															+    ggml_free(ctx);
														
 
															     full_seqs->type = GGML_TYPE_I32;
														
 
															     job.prefix_seq->type = GGML_TYPE_I32;
														
@@ -1210,8 +1208,7 @@ int topk(
 
															     std::int64_t k,
														
 
															     ggml_tensor* candidate_indices
														
 
															 ) {
														
 
															-    ZoneNamed(topk, true);
														
 
															-    // Take the best 2 x `beam_size` predictions. We'll choose the first
														
 
															+        // Take the best 2 x `beam_size` predictions. We'll choose the first
														
 
															     // `beam_size` of these which don't predict EOS to continue with.
														
 
															     // (N, 2 x B)
														
 
															     // `vocab_size` - 1 to never select PAD.
														
@@ -1227,8 +1224,7 @@ int topk(
 
															 }
														
 
															 void _tweak_lprobs(const SequenceGeneratorJob& job, ggml_tensor* lprobs, int step_nr, int max_seq_len, std::size_t vocab_size) {
														
 
															-    ZoneNamed(tweak_lprobs, true);
														
 
															-    std::size_t beam_size = job.opts.beam_size;
														
 
															+        std::size_t beam_size = job.opts.beam_size;
														
 
															     std::size_t eos_idx = job.eos_idx;
														
 
															     // Do not allow EOS before reaching the minimum sequence length.
														
@@ -1279,8 +1275,7 @@ void _finalize_hypothesis(
 
															     ggml_tensor* scores, // (beam_size, seq_len)
														
 
															     Hypothesis* hypothesis
														
 
															 ) {
														
 
															-    ZoneNamed(_finalize_hypothesis, true);
														
 
															-    ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
														
 
															+        ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
														
 
															     hypothesis->seq = seq;
														
 
															     ggml_tensor* step_scores = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, step_nr + 2);
														
 
															     hypothesis->step_scores = step_scores;
														
@@ -1331,11 +1326,10 @@ extern "C" Hypothesis* generate_sequence(
 
															     ggml_tensor* encoder_padding_mask,
														
 
															     ggml_context* result_ctx
														
 
															 ) {
														
 
															-    ZoneScoped;
														
 
															     std::vector<uint8_t> local_bufs[3] = {
														
 
															-        std::vector<uint8_t>(256 * 1024 * 1024),  // step_ctx
														
 
															-        std::vector<uint8_t>(256 * 1024 * 1024),  // next_step_ctx
														
 
															-        std::vector<uint8_t>(256 * 1024 * 1024)  // search_ctx
														
 
															+        std::vector<uint8_t>(1024 * 1024 * 1024),  // step_ctx
														
 
															+        std::vector<uint8_t>(1024 * 1024 * 1024),  // next_step_ctx
														
 
															+        std::vector<uint8_t>(1024 * 1024 * 1024)  // search_ctx
														
 
															     };
														
 
															     ggml_context* search_ctx = ctx_from_buffer(local_bufs[2]);
														
@@ -1441,7 +1435,6 @@ extern "C" Hypothesis* generate_sequence(
 
															         std::size_t ongoing_beams = 0;
														
 
															         for (std::int32_t i = 0; i < K; ++i) {
														
 
															-            ZoneNamed(beam_search_step, true);
														
 
															             int c = ggml_get_f32_1d(candidate_indices, i);
														
 
															             std::int32_t beam = c / vocab_size;
														
 
															             std::int32_t token = c % vocab_size;
														
@@ -1476,15 +1469,15 @@ extern "C" Hypothesis* generate_sequence(
 
															             new_scores = ggml_get_rows(search_ctx, scores, beam_indices);
														
 
															             ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
														
 
															             ggml_build_forward_expand(&gf_reorder, new_scores);
														
 
															-            next_step_ctx = ctx_from_buffer(local_bufs[(step_nr + 1) % 2]);
														
 
															-            reorder_kv_cache(model, next_step_ctx, &gf_reorder, beam_indices);
														
 
															-
														
 
															-            ggml_graph_compute_with_ctx(next_step_ctx, &gf_reorder, 1);
														
 
															+            reorder_kv_cache(model, step_ctx, &gf_reorder, beam_indices);
														
 
															+            ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, 1);
														
 
															             ggml_detach(new_seqs);
														
 
															             ggml_detach(new_scores);
														
 
															             new_seqs->type = GGML_TYPE_I32;
														
 
															             printf_mem_usage(search_ctx, "search_ctx");
														
 
															+            next_step_ctx = ctx_from_buffer(local_bufs[(step_nr + 1) % 2]);
														
 
															             SWAP(step_ctx, next_step_ctx);
														
 
															+            ggml_free(next_step_ctx);
														
 
															         }
														
 
															         // new_seqs[:, step_nr + 1] = next_tokens
														
--- a/ggml/examples/unity/unity.cpp
+++ b/ggml/examples/unity/unity.cpp
@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
 
															     }
														
 
															     char result_str[4096];
														
 
															-    static std::vector<uint8_t> encoder_buf(4 * 1024LL * 1024LL * 1024LL);
														
 
															+    static std::vector<uint8_t> encoder_buf(20 * 1024LL * 1024LL * 1024LL);
														
 
															     std::string input;
														
 
															     bool interactive = params.files.size() == 0;
														
@@ -189,13 +189,14 @@ int main(int argc, char ** argv) {
 
															         // Beam search decoding
														
 
															         const Hypothesis* result = unity_decode(model, params.opts, tgt_lang_idx, encoder_output, params.n_threads);
														
 
															-
														
 
															+    
														
 
															         // Drop language and bos token.
														
 
															         ggml_tensor* tokens = ggml_slice(model.ctx, result[0].seq, 0, 2, 0);
														
 
															         // Collect result string
														
 
															         int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str);
														
 
															         std::cout << std::string((char*)&result_str, n) << std::endl;
														
 
															+        ggml_free(model.ctx);
														
 
															     }
														
 
															     return 0;
														
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -261,12 +261,8 @@ target_include_directories(${TARGET} PUBLIC
 
															     ../include
														
 
															     ../include/ggml
														
 
															     ../examples/
														
 
															-    ../tracy/public/
														
 
															     ${GGML_EXTRA_INCS}
														
 
															     )
														
 
															-if (TRACY_ENABLE)
														
 
															-    target_link_libraries (${TARGET} PUBLIC Tracy::TracyClient )
														
 
															-endif()
														
 
															 if (MSVC)
														
 
															     target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank)
														
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -25,7 +25,6 @@
 
															 #include <limits.h>
														
 
															 #include <stdarg.h>
														
 
															 #include <signal.h>
														
 
															-#include "tracy/TracyC.h"
														
 
															 #ifdef GGML_USE_METAL
														
@@ -11692,17 +11691,11 @@ static void ggml_compute_forward_mul_mat(
 
															                     x = wdata;
														
 
															                 }
														
 
															-                TracyCPlot("cblas_sgemm_B", ne13 * ne12);
														
 
															-                TracyCPlot("cblas_sgemm_M", ne11);
														
 
															-                TracyCPlot("cblas_sgemm_N", ne01);
														
 
															-                TracyCPlot("cblas_sgemm_K", ne10);
														
 
															-                TracyCZoneN(_tracy_sgemm, "cblas_sgemm", true);
														
 
															                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
														
 
															                         ne11, ne01, ne10,
														
 
															                         1.0f,    y, ne10,
														
 
															                                  x, ne00,
														
 
															                         0.0f,    d, ne01);
														
 
															-                TracyCZoneEnd(_tracy_sgemm);
														
 
															             }
														
 
															         }
														
@@ -16735,472 +16728,324 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 
															     switch (tensor->op) {
														
 
															         case GGML_OP_DUP:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_DUP", true);
														
 
															                 ggml_compute_forward_dup(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_ADD:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD", true);
														
 
															                 ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_ADD1:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD1", true);
														
 
															                 ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_ACC:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_ACC", true);
														
 
															                 ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SUB:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SUB", true);
														
 
															                 ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_MUL:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MUL", true);
														
 
															                 ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_DIV:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIV", true);
														
 
															                 ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SQR:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SQR", true);
														
 
															                 ggml_compute_forward_sqr(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SQRT:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SQRT", true);
														
 
															                 ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_LOG:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_LOG", true);
														
 
															                 ggml_compute_forward_log(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SUM:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SUM", true);
														
 
															                 ggml_compute_forward_sum(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SUM_ROWS:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SUM_ROWS", true);
														
 
															                 ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_MEAN:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MEAN", true);
														
 
															                 ggml_compute_forward_mean(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_ARGMAX:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_ARGMAX", true);
														
 
															                 ggml_compute_forward_argmax(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_REPEAT:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_REPEAT", true);
														
 
															                 ggml_compute_forward_repeat(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_REPEAT_BACK:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_REPEAT_BACK", true);
														
 
															                 ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONCAT:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONCAT", true);
														
 
															                 ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SILU_BACK:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SILU_BACK", true);
														
 
															                 ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_NORM:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_NORM", true);
														
 
															                 ggml_compute_forward_norm(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_BATCH_NORM:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_BATCH_NORM", true);
														
 
															                 ggml_compute_forward_batch_norm(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_RMS_NORM:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_RMS_NORM", true);
														
 
															                 ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_RMS_NORM_BACK:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_RMS_NORM_BACK", true);
														
 
															                 ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_GROUP_NORM:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_GROUP_NORM", true);
														
 
															                 ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_MUL_MAT:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MUL_MAT", true);
														
 
															                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_OUT_PROD:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_OUT_PROD", true);
														
 
															                 ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SCALE:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SCALE", true);
														
 
															                 ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SET:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SET", true);
														
 
															                 ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CPY:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CPY", true);
														
 
															                 ggml_compute_forward_cpy(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONT:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONT", true);
														
 
															                 ggml_compute_forward_cont(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_RESHAPE:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_RESHAPE", true);
														
 
															                 ggml_compute_forward_reshape(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_VIEW:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_VIEW", true);
														
 
															                 ggml_compute_forward_view(params, tensor->src[0]);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_PERMUTE:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_PERMUTE", true);
														
 
															                 ggml_compute_forward_permute(params, tensor->src[0]);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_TRANSPOSE:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_TRANSPOSE", true);
														
 
															                 ggml_compute_forward_transpose(params, tensor->src[0]);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_GET_ROWS:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_ROWS", true);
														
 
															                 ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_GET_ROWS_BACK:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_ROWS_BACK", true);
														
 
															                 ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_DIAG:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG", true);
														
 
															                 ggml_compute_forward_diag(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_DIAG_MASK_INF:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG_MASK_INF", true);
														
 
															                 ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_DIAG_MASK_ZERO:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG_MASK_ZERO", true);
														
 
															                 ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SOFT_MAX:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SOFT_MAX", true);
														
 
															                 ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_SOFT_MAX_BACK:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_SOFT_MAX_BACK", true);
														
 
															                 ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_ROPE:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_ROPE", true);
														
 
															                 ggml_compute_forward_rope(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_ROPE_BACK:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_ROPE_BACK", true);
														
 
															                 ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_ALIBI:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_ALIBI", true);
														
 
															                 ggml_compute_forward_alibi(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CLAMP:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CLAMP", true);
														
 
															                 ggml_compute_forward_clamp(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONV_1D:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D", true);
														
 
															                 ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONV_1D_STAGE_0:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_0", true);
														
 
															                 ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONV_1D_STAGE_1:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_1", true);
														
 
															                 ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONV_1D_STAGE_2:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_2", true);
														
 
															                 ggml_compute_forward_conv_1d_stage_2(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONV_1D_GENERIC:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC", true);
														
 
															                 ggml_compute_forward_conv_1d_generic(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONV_1D_GENERIC_STAGE_0:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC_STAGE_0", true);
														
 
															                 ggml_compute_forward_conv_1d_generic_stage_0(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONV_1D_GENERIC_STAGE_1:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC_STAGE_1", true);
														
 
															                 ggml_compute_forward_conv_1d_generic_stage_1(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONV_2D:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_2D", true);
														
 
															                 ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_CONV_TRANSPOSE_2D:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_TRANSPOSE_2D", true);
														
 
															                 ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_POOL_1D:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_POOL_1D", true);
														
 
															                 ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_POOL_2D:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_POOL_2D", true);
														
 
															                 ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_UPSCALE:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_UPSCALE", true);
														
 
															                 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_FLASH_ATTN:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_ATTN", true);
														
 
															                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
														
 
															                 GGML_ASSERT(t == 0 || t == 1);
														
 
															                 const bool masked = t != 0;
														
 
															                 ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_FLASH_FF:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_FF", true);
														
 
															                 ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_FLASH_ATTN_BACK:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_ATTN_BACK", true);
														
 
															                 int32_t t = ggml_get_op_params_i32(tensor, 0);
														
 
															                 GGML_ASSERT(t == 0 || t == 1);
														
 
															                 bool masked = t != 0;
														
 
															                 ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_WIN_PART:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_WIN_PART", true);
														
 
															                 ggml_compute_forward_win_part(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_WIN_UNPART:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_WIN_UNPART", true);
														
 
															                 ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_UNARY:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_UNARY", true);
														
 
															                 ggml_compute_forward_unary(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_GET_REL_POS:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_REL_POS", true);
														
 
															                 ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_ADD_REL_POS:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD_REL_POS", true);
														
 
															                 ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             } break;
														
 
															         case GGML_OP_MAP_UNARY:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_UNARY", true);
														
 
															                 ggml_unary_op_f32_t fun;
														
 
															                 memcpy(&fun, tensor->op_params, sizeof(fun));
														
 
															                 ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_MAP_BINARY:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_BINARY", true);
														
 
															                 ggml_binary_op_f32_t fun;
														
 
															                 memcpy(&fun, tensor->op_params, sizeof(fun));
														
 
															                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_MAP_CUSTOM1_F32:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM1_F32", true);
														
 
															                 ggml_custom1_op_f32_t fun;
														
 
															                 memcpy(&fun, tensor->op_params, sizeof(fun));
														
 
															                 ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_MAP_CUSTOM2_F32:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM2_F32", true);
														
 
															                 ggml_custom2_op_f32_t fun;
														
 
															                 memcpy(&fun, tensor->op_params, sizeof(fun));
														
 
															                 ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_MAP_CUSTOM3_F32:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM3_F32", true);
														
 
															                 ggml_custom3_op_f32_t fun;
														
 
															                 memcpy(&fun, tensor->op_params, sizeof(fun));
														
 
															                 ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_MAP_CUSTOM1:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM1", true);
														
 
															                 ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_MAP_CUSTOM2:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM2", true);
														
 
															                 ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_MAP_CUSTOM3:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM3", true);
														
 
															                 ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_CROSS_ENTROPY_LOSS:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CROSS_ENTROPY_LOSS", true);
														
 
															                 ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
														
 
															             {
														
 
															-                TracyCZoneN(__tracy_ctx, "GGML_OP_CROSS_ENTROPY_LOSS_BACK", true);
														
 
															                 ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
														
 
															-                TracyCZoneEnd(__tracy_ctx);
														
 
															             }
														
 
															             break;
														
 
															         case GGML_OP_NONE:
														
@@ -19063,17 +18908,13 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 
															 }
														
 
															 void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
														
 
															-    TracyCZoneN(_tracy_graph, "ggml_graph_plan", true);
														
 
															     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
														
 
															-    TracyCZoneEnd(_tracy_graph);
														
 
															     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
														
 
															     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
														
 
															-    TracyCZoneN(_tracy_compute, "ggml_graph_compute", true);
														
 
															     ggml_graph_compute(cgraph, &cplan);
														
 
															-    TracyCZoneEnd(_tracy_compute);
														
 
															 }
														
 
															 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
														
--- a/ggml/test_unity_cpp.py
+++ b/ggml/test_unity_cpp.py
@@ -841,4 +841,4 @@ def assert_hypotheses(
 
															         g_step_scores = ggml.to_numpy(g_hyp.step_scores)
														
 
															         assert g_tokens == exp["seq"]
														
 
															         assert g_hyp.score == pytest.approx(exp["score"], rel=score_rtol)
														
 
															-        assert np.allclose(g_step_scores, exp["step_scores"], rtol=step_scores_rtol)
														
 
															+        assert np.allclose(g_step_scores, exp["step_scores"], rtol=step_scores_rtol)