#pragma once

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

#include "ggml.h"
#include "kaldi-native-fbank/csrc/feature-fbank.h"

struct KeyValueTensor {
    ggml_tensor* full_k;
    ggml_tensor* full_v;
    ggml_tensor* self_attn_mask;
    int step_nr;
};

struct fairseq2_model {
    // Context containing the memory of all tensors.
    ggml_context* tensors_ctx;
    // Named tensors; all of them should belong to tensors_ctx.
    std::unordered_map<std::string, struct ggml_tensor *> tensors;
    // Hashmap containing model hyper-parameters.
    std::unordered_map<std::string, std::int64_t> hparams;
    // Hashmap containing layer hyper-parameters.
    // Normally these can be inferred from hparams, but storing them avoids redoing that logic in GGML.
    std::unordered_map<std::string, std::int64_t> layer_config;
    // KV cache for attention layers.
    mutable std::unordered_map<std::string, KeyValueTensor> kv_cache;
    // An inference context, not managed by this object.
    // TODO: is this the best place to store this, or should we also pass it to all forward methods?
    ggml_context* ctx;
};

double fairseq2_model_layer_config_double(const fairseq2_model& model, std::string name);

/// Allocates the fairseq2 model and hyperparameters.
extern "C" fairseq2_model* fairseq2_model_alloc();

/// Frees the model and all its owned tensors.
extern "C" void fairseq2_model_free(fairseq2_model* model);

extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx);
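
// Illustrative lifecycle sketch (not part of this header's API; assumes a
// ggml_context created elsewhere with ggml_init, and a weight loader that
// fills `tensors`/`hparams`, which is declared elsewhere):
//
//   fairseq2_model* model = fairseq2_model_alloc();
//   // ... load weights and hyper-parameters into *model ...
//   fairseq2_model_set_inference_ctx(model, inference_ctx);  // not owned by the model (see struct comment above)
//   // ... call the *_forward functions declared below ...
//   fairseq2_model_free(model);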

extern "C" std::string* std_string_alloc(char* c_str);
extern "C" void std_string_free(std::string* str);
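
// Sketch of how a C-ABI caller (e.g. language bindings) might build the
// std::string `prefix` arguments expected by the forward functions below
// (the prefix value is a hypothetical module path):
//
//   std::string* prefix = std_string_alloc((char*)"text_decoder.layers.0.self_attn");
//   // ... pass *prefix to a forward function ...
//   std_string_free(prefix);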

extern "C" ggml_tensor* WaveformToFbank_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* waveform
);

extern "C" ggml_tensor* ggml_slice(
    struct ggml_context* ctx,
    struct ggml_tensor* a,
    int axis,
    int64_t start,
    int64_t end
);
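
// Illustrative ggml_slice call (assumption: `start`/`end` select the half-open
// range [start, end) along `axis`, with negative values counted from the end;
// see the implementation for the exact convention):
//
//   // keep the first four elements of `x` along axis 0
//   ggml_tensor* head = ggml_slice(ctx, x, /*axis=*/0, /*start=*/0, /*end=*/4);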

/// Merge the given dimension and the previous one in the tensor.
/// (..., num_heads, N, ...) -> (..., num_heads * N, ...)
/// dim is the position of the resulting merged dimension.
/// ggml_flatten_1d(x, d) <==> torch.flatten(x, -1-d-1, -1-d)
extern "C" ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim);

/// Split the given dimension.
/// (..., K * N, ...) -> (..., K, N, ...)
/// dim is the position of the output dimension with the given number of elements (N).
extern "C" ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int num_el);
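
// Shape sketch for the two reshaping helpers above, written with torch-style
// shapes and derived from the stated torch.flatten equivalence (the (B, H, S, D)
// shape is illustrative, and `dim` is assumed to count from the last dimension):
//
//   // x: (B, H, S, D)
//   // ggml_flatten_1d(ctx, x, 0)      ~ torch.flatten(x, -2, -1) -> (B, H, S * D)
//   // ggml_unflatten_1d(ctx, y, 0, D) splits the last dim back   -> (B, H, S, D)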

extern "C" ggml_tensor* Linear_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* input
);
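
// The module forwards below share one convention: `prefix` is the dotted path
// of the module whose parameters are looked up in `model.tensors`. Illustrative
// call (the prefix string is hypothetical, and the "<prefix>.weight" /
// "<prefix>.bias" key layout is an assumption about the loader):
//
//   ggml_tensor* h = Linear_forward(model, "text_decoder.layers.0.ffn.inner_proj", seqs);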

extern "C" ggml_tensor* LayerNorm_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* input
);

extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

extern "C" ggml_tensor* SiluFeedForwardNetwork_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

extern "C" ggml_tensor* MultiheadAttention_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* queries,   // (slen, d_in)
    ggml_tensor* keys,      // (klen, d_in)
    ggml_tensor* values,    // (klen, d_out)
    ggml_tensor* attn_mask  // (klen, slen)
);
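
// Illustrative self-attention call (assumptions: for self-attention the same
// tensor is passed as queries/keys/values, and a null attn_mask means
// "no masking"; shapes follow the parameter comments above):
//
//   ggml_tensor* attn = MultiheadAttention_forward(
//       model, "speech_encoder.layers.0.self_attn",
//       seqs, seqs, seqs, /*attn_mask=*/nullptr);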

extern "C" ggml_tensor* PositionalEmbedding_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* embeds
);

extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

extern "C" ggml_tensor* RelativePositionMHA_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

extern "C" ggml_tensor* ConvModule_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

extern "C" ggml_tensor* StandardConformerEncoderLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

extern "C" ggml_tensor* StandardConformerEncoder_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

extern "C" ggml_tensor* StandardConformerEncoderAdaptor_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);
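
// Sketch of how the speech-encoder forwards above are meant to compose
// (the prefixes are hypothetical, and a null padding_mask is assumed to mean
// "no padding"):
//
//   ggml_tensor* fbank = WaveformToFbank_forward(model, "speech_encoder_frontend", waveform);
//   ggml_tensor* enc = StandardConformerEncoder_forward(
//       model, "speech_encoder", fbank, /*padding_mask=*/nullptr);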

// Specifies the Layer Normalization order.
// See fairseq2/nn/transformer/norm_order.py
enum TransformerNormOrder {
    TRANSFORMER_NORM_ORDER_POST = 0,
    TRANSFORMER_NORM_ORDER_PRE = 1,
    TRANSFORMER_NORM_ORDER_PRE_WITH_NORMFORMER = 2
};

/// Holds the options to pass to a sequence generator.
struct SequenceGeneratorOptions {
    /// The beam size.
    int beam_size = 5;

    /// The minimum length of generated sequences (including the prefix sequence).
    int min_seq_len = 1;

    /// The terms ``a`` and ``b`` of ``ax + b``, where ``x`` is the source
    /// sequence length. Generated sequences (including the prefix sequence)
    /// will have a maximum length of ``min(hard_max_seq_len, ax + b)``. See
    /// also ``hard_max_seq_len``.
    float soft_max_seq_len_a = 1;
    int soft_max_seq_len_b = 200;

    /// The hard limit on the maximum length of generated sequences.
    int hard_max_seq_len = 1024;

    /// The length penalty: values less than 1.0 favor shorter sequences,
    /// values greater than 1.0 favor longer sequences.
    float len_penalty = 1.0;

    /// The unknown-symbol penalty: values less than 0 produce more UNKs,
    /// values greater than 0 produce fewer UNKs.
    float unk_penalty = 0.0;

    /// If ``true``, normalizes scores by the length of generated sequences.
    bool normalize_scores = true;
};
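
// Worked example of the length cap above: with the defaults
// soft_max_seq_len_a = 1 and soft_max_seq_len_b = 200, a source sequence of
// length 100 gives min(hard_max_seq_len, 1 * 100 + 200) = min(1024, 300) = 300.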

struct SequenceGeneratorJob {
    SequenceGeneratorOptions opts;
    ggml_tensor* prefix_seq;
    std::int32_t pad_idx;
    std::int32_t unk_idx;
    std::int32_t bos_idx;
    std::int32_t eos_idx;
    std::int32_t num_threads;
};

/// Represents a hypothesis produced by a sequence generator.
struct Hypothesis {
    /// The generated sequence.
    ggml_tensor* seq;

    /// The score of the hypothesis.
    float score;

    /// The score of each individual sequence step.
    ggml_tensor* step_scores;
};

extern "C" Hypothesis* generate_sequence(
    fairseq2_model& model,
    const SequenceGeneratorJob& job,
    ggml_tensor* encoder_output,
    ggml_tensor* encoder_padding_mask,
    ggml_context* result_ctx
);
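
// Illustrative end-to-end generation sketch (the concrete vocabulary indices
// and the contents of `prefix` are hypothetical; what the returned pointer
// refers to exactly, e.g. the beam_size-best hypotheses allocated in
// `result_ctx`, is defined by the implementation, not by this header):
//
//   SequenceGeneratorJob job;
//   job.opts = SequenceGeneratorOptions{};  // defaults above
//   job.prefix_seq = prefix;                // e.g. [bos_idx, target-language token]
//   job.pad_idx = 0; job.unk_idx = 1; job.bos_idx = 2; job.eos_idx = 3;
//   job.num_threads = 4;
//   Hypothesis* hyps = generate_sequence(
//       model, job, encoder_output, /*encoder_padding_mask=*/nullptr, result_ctx);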