Parcourir la source

sync ggml to public SC

cndn il y a 1 an
Parent
commit
2828c97c36
2 fichiers modifiés avec 11 ajouts et 11 suppressions
  1. 8 11
      ggml/examples/unity/fairseq2.cpp
  2. 3 0
      ggml/examples/unity/unity.cpp

+ 8 - 11
ggml/examples/unity/fairseq2.cpp

@@ -140,15 +140,15 @@ void append_to_prev_kv(const fairseq2_model& model, const std::string& prefix, g
 
     // qk is (B * H, Sq, Sk) == (B*H, 1, Sk) in incremental mode
     // we return the Sq slice of the (Sq, Sk) attention mask
-    if (self_attn_mask != nullptr) {
-        *self_attn_mask = ggml_slice(
-            ctx, ggml_slice(ctx, kv.self_attn_mask, 0, 0, step_nr),
-            1, step_nr - 1, step_nr
-        );
-    }
+    *self_attn_mask = ggml_slice(
+        model.ctx,
+        ggml_slice(model.ctx, kv.self_attn_mask, 0, 0, step_nr),
+        1,
+        step_nr - 1,
+        step_nr
+    );
 
     kv.step_nr = step_nr;
-    ggml_set_no_alloc(ctx, no_alloc_save);
 }
 
 // variant of ggml_get_rows that allows for a with more than 2 dims.
@@ -1481,7 +1481,7 @@ extern "C" Hypothesis* generate_sequence(
             }
         }
         ggml_tensor* prev_token = ggml_slice(step_ctx, seqs, 0, step_nr, step_nr + 1);
-
+        
         ggml_tensor* decoder_input = TransformerEmbeddingFrontend_forward(model, "text_decoder_frontend", prev_token);
         ggml_tensor* decoder_output = StandardTransformerDecoder_forward(
             model,
@@ -1565,8 +1565,6 @@ extern "C" Hypothesis* generate_sequence(
         // Reorder beams in the `seq` and `score` buffers. The same beam can
         // be selected more than once.
         // (B, S), (B) -> (B, S)
-        // don't use allocr API, cause it might reuse a kv cache buffer several time.
-        ggml_set_no_alloc(step_ctx, false);
         ggml_tensor* new_seqs = ggml_get_rows(step_ctx, seqs, beam_indices);
         ggml_tensor* new_scores = ggml_get_rows(step_ctx, scores, beam_indices);
         struct ggml_cgraph * gf_reorder = ggml_new_graph(step_ctx);
@@ -1848,7 +1846,6 @@ std::pair<std::vector<std::string>, std::vector<float>> fairseq2_spm_detokenize(
         std::size_t n = token.end() - begin;
         written += n;
         out += n;
-
     }
     if(subword_scores.size() > 0) {
         word_scores.push_back(*std::min_element(subword_scores.begin(), subword_scores.end()));

+ 3 - 0
ggml/examples/unity/unity.cpp

@@ -36,6 +36,9 @@ void unity_print_usage(int /*argc*/, char ** argv, const unity_params & params)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -i, --input           Input text for the text-2-text translation\n");
+    fprintf(stderr, "  -l, --tgt-lang        Target translation lang (default: %s\n", params.tgt_lang);
+
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -v, --verbose         Print out word level confidence score and LID score (default: off)");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");