
Fix unity.cpp ctx management (#177)

* CLI & bug fix

* ctx fix

* revert undesired changes
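
The core of the fix is the ggml context lifecycle: each context is backed by a fixed buffer (the repo's `ctx_from_buffer` helper), a graph is built and computed inside it, and the context is freed once results have been detached or copied out, so the same buffer can be reused on the next decoding step. Below is a minimal sketch of that lifecycle using the older ggml API this repo pins (`ggml_build_forward` returning a `ggml_cgraph` by value); the buffer size and tensor shapes are illustrative, not taken from the commit.

```cpp
#include <vector>
#include "ggml.h"

int main() {
    // Fixed backing buffer, reused across steps (cf. local_bufs in fairseq2.cpp).
    std::vector<uint8_t> buf(16 * 1024 * 1024);

    struct ggml_init_params params = {
        /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ false,
    };
    // Same role as the repo's ctx_from_buffer() helper.
    struct ggml_context* ctx = ggml_init(params);

    // Build a small graph inside the context.
    struct ggml_tensor* a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor* b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);
    struct ggml_tensor* out = ggml_add(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(out);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads=*/1);

    // Free the context once the results are no longer needed; the buffer
    // itself stays valid and can back a fresh context on the next step.
    ggml_free(ctx);
    return 0;
}
```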
Ning, 1 year ago
parent
commit 31f2419086

+ 0 - 3
ggml/CMakeLists.txt

@@ -160,9 +160,6 @@ target_include_directories(kaldi-native-fbank PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc
 )
 
-option( TRACY_ENABLE "" ON)
-option( TRACY_ON_DEMAND "" ON)
-add_subdirectory (tracy)
 
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)

+ 1 - 1
ggml/examples/common.cpp

@@ -806,4 +806,4 @@ void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
     fprintf(stderr, "  -o FNAME, --out FNAME\n");
     fprintf(stderr, "                        output file (default: %s)\n", params.fname_out.c_str());
     fprintf(stderr, "\n");
-}
+}

+ 1 - 1
ggml/examples/common.h

@@ -179,4 +179,4 @@ struct sam_params {
 
 bool sam_params_parse(int argc, char ** argv, sam_params & params);
 
-void sam_print_usage(int argc, char ** argv, const sam_params & params);
+void sam_print_usage(int argc, char ** argv, const sam_params & params);

+ 12 - 19
ggml/examples/unity/fairseq2.cpp

@@ -7,8 +7,6 @@
 
 #include "kaldi-native-fbank/csrc/feature-fbank.h"
 #include "kaldi-native-fbank/csrc/feature-window.h"
-#include "tracy/Tracy.hpp"
-
 #include "fairseq2.h"
 #include "ggml.h"
 
@@ -18,7 +16,7 @@ ggml_tensor* ggml_detach(ggml_tensor* a) {
     return a;
 }
 
-#define DEBUG_MEM_USAGE 1
+#define DEBUG_MEM_USAGE 0
 
 void printf_mem_usage(ggml_context* ctx, std::string name) {
 #if DEBUG_MEM_USAGE
@@ -1147,7 +1145,6 @@ extern "C" void _bootstrap_seqs_and_scores(
     ggml_tensor* encoder_output,
     ggml_tensor* encoder_padding_mask
 ) {
-    ZoneScoped;
     int prefix_seq_len = job.prefix_seq->ne[0];
     int max_seq_len = scores->ne[0];
     int beam_size = scores->ne[1];
@@ -1188,6 +1185,7 @@ extern "C" void _bootstrap_seqs_and_scores(
 
     ggml_cgraph gf = ggml_build_forward(lprobs);
     ggml_graph_compute_with_ctx(ctx, &gf, 1);
+    ggml_free(ctx);
     full_seqs->type = GGML_TYPE_I32;
     job.prefix_seq->type = GGML_TYPE_I32;
 
@@ -1210,8 +1208,7 @@ int topk(
     std::int64_t k,
     ggml_tensor* candidate_indices
 ) {
-    ZoneNamed(topk, true);
-    // Take the best 2 x `beam_size` predictions. We'll choose the first
+        // Take the best 2 x `beam_size` predictions. We'll choose the first
     // `beam_size` of these which don't predict EOS to continue with.
     // (N, 2 x B)
     // `vocab_size` - 1 to never select PAD.
@@ -1227,8 +1224,7 @@ int topk(
 }
 
 void _tweak_lprobs(const SequenceGeneratorJob& job, ggml_tensor* lprobs, int step_nr, int max_seq_len, std::size_t vocab_size) {
-    ZoneNamed(tweak_lprobs, true);
-    std::size_t beam_size = job.opts.beam_size;
+        std::size_t beam_size = job.opts.beam_size;
     std::size_t eos_idx = job.eos_idx;
 
     // Do not allow EOS before reaching the minimum sequence length.
@@ -1279,8 +1275,7 @@ void _finalize_hypothesis(
     ggml_tensor* scores, // (beam_size, seq_len)
     Hypothesis* hypothesis
 ) {
-    ZoneNamed(_finalize_hypothesis, true);
-    ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
+        ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2);
     hypothesis->seq = seq;
     ggml_tensor* step_scores = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, step_nr + 2);
     hypothesis->step_scores = step_scores;
@@ -1331,11 +1326,10 @@ extern "C" Hypothesis* generate_sequence(
     ggml_tensor* encoder_padding_mask,
     ggml_context* result_ctx
 ) {
-    ZoneScoped;
     std::vector<uint8_t> local_bufs[3] = {
-        std::vector<uint8_t>(256 * 1024 * 1024),  // step_ctx
-        std::vector<uint8_t>(256 * 1024 * 1024),  // next_step_ctx
-        std::vector<uint8_t>(256 * 1024 * 1024)  // search_ctx
+        std::vector<uint8_t>(1024 * 1024 * 1024),  // step_ctx
+        std::vector<uint8_t>(1024 * 1024 * 1024),  // next_step_ctx
+        std::vector<uint8_t>(1024 * 1024 * 1024)  // search_ctx
     };
     ggml_context* search_ctx = ctx_from_buffer(local_bufs[2]);
 
@@ -1441,7 +1435,6 @@ extern "C" Hypothesis* generate_sequence(
 
         std::size_t ongoing_beams = 0;
         for (std::int32_t i = 0; i < K; ++i) {
-            ZoneNamed(beam_search_step, true);
             int c = ggml_get_f32_1d(candidate_indices, i);
             std::int32_t beam = c / vocab_size;
             std::int32_t token = c % vocab_size;
@@ -1476,15 +1469,15 @@ extern "C" Hypothesis* generate_sequence(
             new_scores = ggml_get_rows(search_ctx, scores, beam_indices);
             ggml_cgraph gf_reorder = ggml_build_forward(new_seqs);
             ggml_build_forward_expand(&gf_reorder, new_scores);
-            next_step_ctx = ctx_from_buffer(local_bufs[(step_nr + 1) % 2]);
-            reorder_kv_cache(model, next_step_ctx, &gf_reorder, beam_indices);
-
-            ggml_graph_compute_with_ctx(next_step_ctx, &gf_reorder, 1);
+            reorder_kv_cache(model, step_ctx, &gf_reorder, beam_indices);
+            ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, 1);
             ggml_detach(new_seqs);
             ggml_detach(new_scores);
             new_seqs->type = GGML_TYPE_I32;
             printf_mem_usage(search_ctx, "search_ctx");
+            next_step_ctx = ctx_from_buffer(local_bufs[(step_nr + 1) % 2]);
             SWAP(step_ctx, next_step_ctx);
+            ggml_free(next_step_ctx);
         }
 
         // new_seqs[:, step_nr + 1] = next_tokens

+ 3 - 2
ggml/examples/unity/unity.cpp

@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
     }
 
     char result_str[4096];
-    static std::vector<uint8_t> encoder_buf(4 * 1024LL * 1024LL * 1024LL);
+    static std::vector<uint8_t> encoder_buf(20 * 1024LL * 1024LL * 1024LL);
 
     std::string input;
     bool interactive = params.files.size() == 0;
@@ -189,13 +189,14 @@ int main(int argc, char ** argv) {
 
         // Beam search decoding
         const Hypothesis* result = unity_decode(model, params.opts, tgt_lang_idx, encoder_output, params.n_threads);
-
+    
         // Drop language and bos token.
         ggml_tensor* tokens = ggml_slice(model.ctx, result[0].seq, 0, 2, 0);
 
         // Collect result string
         int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str);
         std::cout << std::string((char*)&result_str, n) << std::endl;
+        ggml_free(model.ctx);
     }
 
     return 0;

+ 0 - 4
ggml/src/CMakeLists.txt

@@ -261,12 +261,8 @@ target_include_directories(${TARGET} PUBLIC
     ../include
     ../include/ggml
     ../examples/
-    ../tracy/public/
     ${GGML_EXTRA_INCS}
     )
-if (TRACY_ENABLE)
-    target_link_libraries (${TARGET} PUBLIC Tracy::TracyClient )
-endif()
 
 if (MSVC)
     target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank)

+ 0 - 159
ggml/src/ggml.c

@@ -25,7 +25,6 @@
 #include <limits.h>
 #include <stdarg.h>
 #include <signal.h>
-#include "tracy/TracyC.h"
 
 
 #ifdef GGML_USE_METAL
@@ -11692,17 +11691,11 @@ static void ggml_compute_forward_mul_mat(
                     x = wdata;
                 }
 
-                TracyCPlot("cblas_sgemm_B", ne13 * ne12);
-                TracyCPlot("cblas_sgemm_M", ne11);
-                TracyCPlot("cblas_sgemm_N", ne01);
-                TracyCPlot("cblas_sgemm_K", ne10);
-                TracyCZoneN(_tracy_sgemm, "cblas_sgemm", true);
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
-                TracyCZoneEnd(_tracy_sgemm);
             }
         }
 
@@ -16735,472 +16728,324 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DUP", true);
                 ggml_compute_forward_dup(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ADD:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD", true);
                 ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ADD1:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD1", true);
                 ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ACC:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ACC", true);
                 ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SUB:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SUB", true);
                 ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MUL:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MUL", true);
                 ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIV:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIV", true);
                 ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SQR:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SQR", true);
                 ggml_compute_forward_sqr(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SQRT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SQRT", true);
                 ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_LOG:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_LOG", true);
                 ggml_compute_forward_log(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SUM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SUM", true);
                 ggml_compute_forward_sum(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SUM_ROWS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SUM_ROWS", true);
                 ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MEAN:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MEAN", true);
                 ggml_compute_forward_mean(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ARGMAX:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ARGMAX", true);
                 ggml_compute_forward_argmax(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_REPEAT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_REPEAT", true);
                 ggml_compute_forward_repeat(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_REPEAT_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_REPEAT_BACK", true);
                 ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONCAT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONCAT", true);
                 ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SILU_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SILU_BACK", true);
                 ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_NORM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_NORM", true);
                 ggml_compute_forward_norm(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_BATCH_NORM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_BATCH_NORM", true);
                 ggml_compute_forward_batch_norm(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_RMS_NORM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_RMS_NORM", true);
                 ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_RMS_NORM_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_RMS_NORM_BACK", true);
                 ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GROUP_NORM:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_GROUP_NORM", true);
                 ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MUL_MAT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MUL_MAT", true);
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_OUT_PROD:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_OUT_PROD", true);
                 ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SCALE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SCALE", true);
                 ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SET:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SET", true);
                 ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CPY:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CPY", true);
                 ggml_compute_forward_cpy(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONT:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONT", true);
                 ggml_compute_forward_cont(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_RESHAPE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_RESHAPE", true);
                 ggml_compute_forward_reshape(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_VIEW:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_VIEW", true);
                 ggml_compute_forward_view(params, tensor->src[0]);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_PERMUTE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_PERMUTE", true);
                 ggml_compute_forward_permute(params, tensor->src[0]);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_TRANSPOSE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_TRANSPOSE", true);
                 ggml_compute_forward_transpose(params, tensor->src[0]);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GET_ROWS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_ROWS", true);
                 ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GET_ROWS_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_ROWS_BACK", true);
                 ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIAG:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG", true);
                 ggml_compute_forward_diag(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIAG_MASK_INF:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG_MASK_INF", true);
                 ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_DIAG_MASK_ZERO", true);
                 ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SOFT_MAX", true);
                 ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_SOFT_MAX_BACK", true);
                 ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ROPE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ROPE", true);
                 ggml_compute_forward_rope(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ROPE_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ROPE_BACK", true);
                 ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ALIBI:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ALIBI", true);
                 ggml_compute_forward_alibi(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CLAMP:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CLAMP", true);
                 ggml_compute_forward_clamp(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D", true);
                 ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_STAGE_0:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_0", true);
                 ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_STAGE_1:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_1", true);
                 ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_STAGE_2:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_STAGE_2", true);
                 ggml_compute_forward_conv_1d_stage_2(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_GENERIC:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC", true);
                 ggml_compute_forward_conv_1d_generic(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_GENERIC_STAGE_0:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC_STAGE_0", true);
                 ggml_compute_forward_conv_1d_generic_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_1D_GENERIC_STAGE_1:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_1D_GENERIC_STAGE_1", true);
                 ggml_compute_forward_conv_1d_generic_stage_1(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_2D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_2D", true);
                 ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CONV_TRANSPOSE_2D", true);
                 ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_POOL_1D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_POOL_1D", true);
                 ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_POOL_2D:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_POOL_2D", true);
                 ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_UPSCALE:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_UPSCALE", true);
                 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_FLASH_ATTN:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_ATTN", true);
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
                 GGML_ASSERT(t == 0 || t == 1);
                 const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_FLASH_FF:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_FF", true);
                 ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_FLASH_ATTN_BACK", true);
                 int32_t t = ggml_get_op_params_i32(tensor, 0);
                 GGML_ASSERT(t == 0 || t == 1);
                 bool masked = t != 0;
                 ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_WIN_PART:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_WIN_PART", true);
                 ggml_compute_forward_win_part(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_WIN_UNPART:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_WIN_UNPART", true);
                 ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_UNARY:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_UNARY", true);
                 ggml_compute_forward_unary(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_GET_REL_POS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_GET_REL_POS", true);
                 ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_ADD_REL_POS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_ADD_REL_POS", true);
                 ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             } break;
         case GGML_OP_MAP_UNARY:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_UNARY", true);
                 ggml_unary_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_BINARY:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_BINARY", true);
                 ggml_binary_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM1_F32:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM1_F32", true);
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM2_F32:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM2_F32", true);
                 ggml_custom2_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM3_F32:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM3_F32", true);
                 ggml_custom3_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
                 ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM1:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM1", true);
                 ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM2:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM2", true);
                 ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_MAP_CUSTOM3:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_MAP_CUSTOM3", true);
                 ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CROSS_ENTROPY_LOSS", true);
                 ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             {
-                TracyCZoneN(__tracy_ctx, "GGML_OP_CROSS_ENTROPY_LOSS_BACK", true);
                 ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-                TracyCZoneEnd(__tracy_ctx);
             }
             break;
         case GGML_OP_NONE:
@@ -19063,17 +18908,13 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 }
 
 void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
-    TracyCZoneN(_tracy_graph, "ggml_graph_plan", true);
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
-    TracyCZoneEnd(_tracy_graph);
 
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
 
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
-    TracyCZoneN(_tracy_compute, "ggml_graph_compute", true);
     ggml_graph_compute(cgraph, &cplan);
-    TracyCZoneEnd(_tracy_compute);
 }
 
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {

+ 1 - 1
ggml/test_unity_cpp.py

@@ -841,4 +841,4 @@ def assert_hypotheses(
         g_step_scores = ggml.to_numpy(g_hyp.step_scores)
         assert g_tokens == exp["seq"]
         assert g_hyp.score == pytest.approx(exp["score"], rel=score_rtol)
-        assert np.allclose(g_step_scores, exp["step_scores"], rtol=step_scores_rtol)
+        assert np.allclose(g_step_scores, exp["step_scores"], rtol=step_scores_rtol)