
Fix unity.cpp ctx management (#177)

* CLI & bug fix

* ctx fix

* revert undesired changes
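
The core of the fix is the ggml context lifecycle: each context is backed by a fixed buffer (the repo's `ctx_from_buffer` helper), a graph is built and computed inside it, and the context is freed once results have been detached or copied out, so the same buffer can be reused on the next decoding step. Below is a minimal sketch of that lifecycle using the older ggml API this repo pins (`ggml_build_forward` returning a `ggml_cgraph` by value); the buffer size and tensor shapes are illustrative, not taken from the commit.

```cpp
#include <vector>
#include "ggml.h"

int main() {
    // Fixed backing buffer, reused across steps (cf. local_bufs in fairseq2.cpp).
    std::vector<uint8_t> buf(16 * 1024 * 1024);

    struct ggml_init_params params = {
        /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ false,
    };
    // Same role as the repo's ctx_from_buffer() helper.
    struct ggml_context* ctx = ggml_init(params);

    // Build a small graph inside the context.
    struct ggml_tensor* a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor* b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);
    struct ggml_tensor* out = ggml_add(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(out);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads=*/1);

    // Free the context once the results are no longer needed; the buffer
    // itself stays valid and can back a fresh context on the next step.
    ggml_free(ctx);
    return 0;
}
```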
Ning, 1 year ago
parent
commit 31f2419086

+ 0 - 3
ggml/CMakeLists.txt

@@ -160,9 +160,6 @@ target_include_directories(kaldi-native-fbank PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc
 )
 
-option( TRACY_ENABLE "" ON)
-option( TRACY_ON_DEMAND "" ON)
-add_subdirectory (tracy)
 
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)

+ 1 - 1
ggml/examples/common.cpp

@@ -806,4 +806,4 @@ void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
     fprintf(stderr, "  -o FNAME, --out FNAME\n");
     fprintf(stderr, "                        output file (default: %s)\n", params.fname_out.c_str());
     fprintf(stderr, "\n");
-}
+}

+ 1 - 1
ggml/examples/common.h

@@ -179,4 +179,4 @@ struct sam_params {
 
 bool sam_params_parse(int argc, char ** argv, sam_params & params);
 
-void sam_print_usage(int argc, char ** argv, const sam_params & params);
+void sam_print_usage(int argc, char ** argv, const sam_params & params);

+ 12 - 19
ggml/examples/unity/fairseq2.cpp

@@ -7,8 +7,6 @@
 
 #include "kaldi-native-fbank/csrc/feature-fbank.h"
 #include "kaldi-native-fbank/csrc/feature-window.h"
-#include "tracy/Tracy.hpp"
-
 #include "fairseq2.h"
 #include "ggml.h"
 
@@ -18,7 +16,7 @@ ggml_tensor* ggml_detach(ggml_tensor* a) {
     return a;
 }
 
-#define DEBUG_MEM_USAGE 1
+#define DEBUG_MEM_USAGE 0
 
 void printf_mem_usage(ggml_context* ctx, std::string name) {
 #if DEBUG_MEM_USAGE
@@ -1147,7 +1145,6 @@ extern "C" void _bootstrap_seqs_and_scores(
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask
 ) {
-    ZoneScoped;
     int prefix_seq_len = job.prefix_seq->ne[0];
     int max_seq_len = scores->ne[0];
     int beam_size = scores->ne[1];
@@ -1188,6 +1185,7 @@ extern "C" void _bootstrap_seqs_and_scores(
 
     ggml_cgraph gf = ggml_build_forward(lprobs);
     ggml_graph_compute_with_ctx(ctx, &gf, 1);
+    ggml_free(ctx);
     full_seqs->type = GGML_TYPE_I32;
     job.prefix_seq->type = GGML_TYPE_I32;
 
@@ -1210,8 +1208,7 @@ int topk(
     std::int64_t k,
     ggml_tensor* candidate_indices
 ) {
-    ZoneNamed(topk, true);
-    // Take the best 2 x `beam_size` predictions. We'll choose the first
+        // Take the best 2 x `beam_size` predictions. We'll choose the first
     // `beam_size` of these which don't predict EOS to continue with.
     // (N, 2 x B)
     // `vocab_size` - 1 to never select PAD.
@@ -1227,8 +1224,7 @@ int topk(
 }
 
 void _tweak_lprobs(const SequenceGeneratorJob& job, ggml_tensor* lprobs, int step_nr, int max_seq_len, std::size_t vocab_size) {
-    ZoneNamed(tweak_lprobs, true);
-    std::size_t beam_size = job.opts.beam_size;
+        std::size_t beam_size = job.opts.beam_size;
     std::size_t eos_idx = job.eos_idx;
 
     // Do not allow EOS before reaching the minimum sequence length.
@@ -1279,8 +1275,7 @@ void _finalize_hypothesis(
     ggml_tensor* scores, // (beam_size, seq_len)
     Hypothesis* hypothesis
 ) {
-    ZoneNamed(_finalize_hypothesis, true);
-    ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
+        ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
     hypothesis->seq = seq;
     ggml_tensor* step_scores = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, step_nr + 2);
     hypothesis->step_scores = step_scores;
@@ -1331,11 +1326,10 @@ extern "C" Hypothesis* generate_sequence(
     ggml_tensor* encoder_padding_mask,
     ggml_context* result_ctx
 ) {
-    ZoneScoped;
     std::vector<uint8_t> local_bufs[3] = {
-        std::vector<uint8_t>(256 * 1024 * 1024),  // step_ctx
-        std::vector<uint8_t>(256 * 1024 * 1024),  // next_step_ctx
-        std::vector<uint8_t>(256 * 1024 * 1024)  // search_ctx
+        std::vector<uint8_t>(1024 * 1024 * 1024),  // step_ctx
+        std::vector<uint8_t>(1024 * 1024 * 1024),  // next_step_ctx
+        std::vector<uint8_t>(1024 * 1024 * 1024)  // search_ctx
     };
     ggml_context* search_ctx = ctx_from_buffer(local_bufs[2]);
 
@@ -1441,7 +1435,6 @@ extern "C" Hypothesis* generate_sequence(
 
         std::size_t ongoing_beams = 0;
         for (std::int32_t i = 0; i < K; ++i) {
-            ZoneNamed(beam_search_step, true);
             int c = ggml_get_f32_1d(candidate_indices, i);
             std::int32_t beam = c / vocab_size;
             std::int32_t token = c % vocab_size;
@@ -1476,15 +1469,15 @@ extern "C" Hypothesis* generate_sequence(
             new_scores = ggml_get_rows(search_ctx, scores, beam_indices);
             ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
             ggml_build_forward_expand(&gf_reorder, new_scores);
-            next_step_ctx = ctx_from_buffer(local_bufs[(step_nr + 1) % 2]);
-            reorder_kv_cache(model, next_step_ctx, &gf_reorder, beam_indices);
-
-            ggml_graph_compute_with_ctx(next_step_ctx, &gf_reorder, 1);
+            reorder_kv_cache(model, step_ctx, &gf_reorder, beam_indices);
+            ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, 1);
             ggml_detach(new_seqs);
             ggml_detach(new_scores);
             new_seqs->type = GGML_TYPE_I32;
             printf_mem_usage(search_ctx, "search_ctx");
+            next_step_ctx = ctx_from_buffer(local_bufs[(step_nr + 1) % 2]);
             SWAP(step_ctx, next_step_ctx);
+            ggml_free(next_step_ctx);
         }
 
         // new_seqs[:, step_nr + 1] = next_tokens

+ 3 - 2
ggml/examples/unity/unity.cpp

@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
     }
 
     char result_str[4096];
-    static std::vector<uint8_t> encoder_buf(4 * 1024LL * 1024LL * 1024LL);
+    static std::vector<uint8_t> encoder_buf(20 * 1024LL * 1024LL * 1024LL);
 
     std::string input;
     bool interactive = params.files.size() == 0;
@@ -189,13 +189,14 @@ int main(int argc, char ** argv) {
 
         // Beam search decoding
         const Hypothesis* result = unity_decode(model, params.opts, tgt_lang_idx, encoder_output, params.n_threads);
-
+    
         // Drop language and bos token.
         ggml_tensor* tokens = ggml_slice(model.ctx, result[0].seq, 0, 2, 0);
 
         // Collect result string
         int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str);
         std::cout << std::string((char*)&result_str, n) << std::endl;
+        ggml_free(model.ctx);
     }
 
     return 0;

+ 0 - 4
ggml/src/CMakeLists.txt

@@ -261,12 +261,8 @@ target_include_directories(${TARGET} PUBLIC
     ../include
     ../include/ggml
     ../examples/
-    ../tracy/public/
     ${GGML_EXTRA_INCS}
     )
-if (TRACY_ENABLE)
-    target_link_libraries (${TARGET} PUBLIC Tracy::TracyClient )
-endif()
 
 if (MSVC)
     target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank)

+ 0 - 159
ggml/src/ggml.c

@@ -25,7 +25,6 @@
 #include <limits.h>
 #include <stdarg.h>
 #include <signal.h>
-#include "tracy/TracyC.h"
 
 
 #ifdef GGML_USE_METAL
@@ -11692,17 +11691,11 @@ static void ggml_compute_forward_mul_mat(
                     x = wdata;
                 }
 
-                TracyCPlot("cblas_sgemm_B", ne13 * ne12);
-                TracyCPlot("cblas_sgemm_M", ne11);
-                TracyCPlot("cblas_sgemm_N", ne01);
-                TracyCPlot("cblas_sgemm_K", ne10);
-                TracyCZoneN(_tracy_sgemm, "cblas_sgemm", true);
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
-                TracyCZoneEnd(_tracy_sgemm);
             }
         }
 
@@ -16735,472 +16728,324 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DUP", true);
                 ggml_compute_forward_dup(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ADD:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD", true);
                 ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ADD1:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD1", true);
                 ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ACC:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ACC", true);
                 ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SUB:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SUB", true);
                 ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MUL:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MUL", true);
                 ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIV:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIV", true);
                 ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SQR:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SQR", true);
                 ggml_compute_forward_sqr(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SQRT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SQRT", true);
                 ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_LOG:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_LOG", true);
                 ggml_compute_forward_log(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SUM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SUM", true);
                 ggml_compute_forward_sum(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SUM_ROWS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SUM_ROWS", true);
                 ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MEAN:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MEAN", true);
                 ggml_compute_forward_mean(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ARGMAX:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ARGMAX", true);
                 ggml_compute_forward_argmax(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_REPEAT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_REPEAT", true);
                 ggml_compute_forward_repeat(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_REPEAT_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_REPEAT_BACK", true);
                 ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONCAT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONCAT", true);
                 ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SILU_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SILU_BACK", true);
                 ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_NORM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_NORM", true);
                 ggml_compute_forward_norm(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_BATCH_NORM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_BATCH_NORM", true);
                 ggml_compute_forward_batch_norm(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_RMS_NORM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_RMS_NORM", true);
                 ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_RMS_NORM_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_RMS_NORM_BACK", true);
                 ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GROUP_NORM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_GROUP_NORM", true);
                 ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MUL_MAT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MUL_MAT", true);
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_OUT_PROD:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_OUT_PROD", true);
                 ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SCALE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SCALE", true);
                 ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SET:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SET", true);
                 ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CPY:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CPY", true);
                 ggml_compute_forward_cpy(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONT", true);
                 ggml_compute_forward_cont(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_RESHAPE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_RESHAPE", true);
                 ggml_compute_forward_reshape(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_VIEW:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_VIEW", true);
                 ggml_compute_forward_view(params, tensor->src[0]);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_PERMUTE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_PERMUTE", true);
                 ggml_compute_forward_permute(params, tensor->src[0]);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_TRANSPOSE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_TRANSPOSE", true);
                 ggml_compute_forward_transpose(params, tensor->src[0]);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GET_ROWS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_ROWS", true);
                 ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GET_ROWS_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_ROWS_BACK", true);
                 ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIAG:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG", true);
                 ggml_compute_forward_diag(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIAG_MASK_INF:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG_MASK_INF", true);
                 ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG_MASK_ZERO", true);
                 ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SOFT_MAX", true);
                 ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SOFT_MAX_BACK", true);
                 ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ROPE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ROPE", true);
                 ggml_compute_forward_rope(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ROPE_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ROPE_BACK", true);
                 ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ALIBI:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ALIBI", true);
                 ggml_compute_forward_alibi(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CLAMP:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CLAMP", true);
                 ggml_compute_forward_clamp(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D", true);
                 ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_STAGE_0:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_0", true);
                 ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_STAGE_1:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_1", true);
                 ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_STAGE_2:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_2", true);
                 ggml_compute_forward_conv_1d_stage_2(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_GENERIC:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC", true);
                 ggml_compute_forward_conv_1d_generic(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_GENERIC_STAGE_0:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC_STAGE_0", true);
                 ggml_compute_forward_conv_1d_generic_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_GENERIC_STAGE_1:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC_STAGE_1", true);
                 ggml_compute_forward_conv_1d_generic_stage_1(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_2D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_2D", true);
                 ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_TRANSPOSE_2D", true);
                 ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_POOL_1D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_POOL_1D", true);
                 ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_POOL_2D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_POOL_2D", true);
                 ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_UPSCALE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_UPSCALE", true);
                 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_FLASH_ATTN:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_ATTN", true);
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
                 GGML_ASSERT(t == 0 || t == 1);
                 const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_FLASH_FF:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_FF", true);
                 ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_ATTN_BACK", true);
                 int32_t t = ggml_get_op_params_i32(tensor, 0);
                 GGML_ASSERT(t == 0 || t == 1);
                 bool masked = t != 0;
                 ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_WIN_PART:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_WIN_PART", true);
                 ggml_compute_forward_win_part(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_WIN_UNPART:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_WIN_UNPART", true);
                 ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_UNARY:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_UNARY", true);
                 ggml_compute_forward_unary(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GET_REL_POS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_REL_POS", true);
                 ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ADD_REL_POS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD_REL_POS", true);
                 ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MAP_UNARY:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_UNARY", true);
                 ggml_unary_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_BINARY:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_BINARY", true);
                 ggml_binary_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM1_F32:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM1_F32", true);
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM2_F32:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM2_F32", true);
                 ggml_custom2_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM3_F32:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM3_F32", true);
                 ggml_custom3_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM1:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM1", true);
                 ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM2:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM2", true);
                 ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM3:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM3", true);
                 ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CROSS_ENTROPY_LOSS", true);
                 ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CROSS_ENTROPY_LOSS_BACK", true);
                 ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_NONE:
@@ -19063,17 +18908,13 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 }
 
 void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
-    TracyCZoneN(_tracy_graph, "ggml_graph_plan", true);
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
-    TracyCZoneEnd(_tracy_graph);
 
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
 
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
-    TracyCZoneN(_tracy_compute, "ggml_graph_compute", true);
     ggml_graph_compute(cgraph, &cplan);
-    TracyCZoneEnd(_tracy_compute);
 }
 
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {

+ 1 - 1
ggml/test_unity_cpp.py

@@ -841,4 +841,4 @@ def assert_hypotheses(
         g_step_scores = ggml.to_numpy(g_hyp.step_scores)
         assert g_tokens == exp["seq"]
         assert g_hyp.score == pytest.approx(exp["score"], rel=score_rtol)
-        assert np.allclose(g_step_scores, exp["step_scores"], rtol=step_scores_rtol)
+        assert np.allclose(g_step_scores, exp["step_scores"], rtol=step_scores_rtol)