// fairseq2.h — model container, vocabulary, and C-linkage inference API
// for running fairseq2 models on top of the ggml tensor library.
#pragma once

#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "ggml.h"
#include "ggml-alloc.h"
#include "kaldi-native-fbank/csrc/feature-fbank.h"
// Temporarily disables `no_alloc` on `ctx` so that the tensor produced by the
// `ggml_new_tensor` expression is actually backed by memory, then restores the
// previous `no_alloc` setting. The new tensor is bound to the variable `name`.
// Note: declares two variables, so it can only be used once per `name` per scope.
#define FORCE_ALLOC(name, ctx, ggml_new_tensor)\
bool name ## _save_no_alloc_ = ggml_get_no_alloc(ctx); \
ggml_set_no_alloc(ctx, false); \
ggml_tensor* name = ggml_new_tensor; \
ggml_set_no_alloc(ctx, name ## _save_no_alloc_);

// Token id type, mirroring llama.cpp's vocabulary conventions.
typedef int32_t llama_token;
// Token categories, mirroring llama.cpp's token classification.
// Kept as a plain C-linkage enum (not `enum class`) for FFI compatibility.
extern "C" enum llama_token_type {
    LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
    LLAMA_TOKEN_TYPE_NORMAL       = 1,
    LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
    LLAMA_TOKEN_TYPE_CONTROL      = 3,
    LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
    LLAMA_TOKEN_TYPE_UNUSED       = 5,
    LLAMA_TOKEN_TYPE_BYTE         = 6,
};
  23. struct llama_vocab {
  24. using id = int32_t;
  25. using token = std::string;
  26. using ttype = llama_token_type;
  27. struct token_data {
  28. token text;
  29. float score;
  30. ttype type;
  31. };
  32. std::unordered_map<token, id> token_to_id;
  33. std::vector<token_data> id_to_token;
  34. std::unordered_map<token, id> special_tokens_cache;
  35. std::map<std::pair<std::string, std::string>, int> bpe_ranks;
  36. // default LLaMA special tokens
  37. id special_bos_id = 1;
  38. id special_eos_id = 2;
  39. id special_unk_id = 0;
  40. id special_sep_id = -1;
  41. id special_pad_id = -1;
  42. int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
  43. int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
  44. id linefeed_id = 13;
  45. id special_prefix_id = 32007;
  46. id special_middle_id = 32009;
  47. id special_suffix_id = 32008;
  48. id special_eot_id = 32010;
  49. int find_bpe_rank(std::string token_left, std::string token_right) const {
  50. GGML_ASSERT(token_left.find(" ") == std::string::npos);
  51. GGML_ASSERT(token_left.find("\n") == std::string::npos);
  52. GGML_ASSERT(token_right.find(" ") == std::string::npos);
  53. GGML_ASSERT(token_right.find("\n") == std::string::npos);
  54. auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
  55. if (it == bpe_ranks.end()) {
  56. return -1;
  57. }
  58. return it->second;
  59. }
  60. };
  61. struct KeyValueTensor {
  62. ggml_tensor* full_k;
  63. ggml_tensor* full_v;
  64. ggml_tensor* self_attn_mask;
  65. int step_nr;
  66. };
/// Owns a loaded fairseq2 model: its weights, hyper-parameters and vocabulary.
struct fairseq2_model {
    // Context containing all tensors memory
    ggml_context* tensors_ctx = nullptr;

    // Named tensors, all tensors should belong to tensors_ctx
    std::unordered_map<std::string, struct ggml_tensor *> tensors = {};

    // Hashmap containing model hyper-parameters.
    std::unordered_map<std::string, std::int64_t> hparams = {};

    // Hashmap containing layers hyper-parameters.
    // Normally those can be inferred from hparams, but it avoids doing this logic in GGML
    std::unordered_map<std::string, std::int64_t> layer_config = {};

    llama_vocab vocab;

    // KV cache for attention layers
    // (`mutable` so it can be updated through a `const fairseq2_model&`).
    mutable std::unordered_map<std::string, KeyValueTensor> kv_cache = {};

    // an inference context, not managed by this object
    // TODO: is this the best place to store this or should we also pass this to all forward methods ?
    ggml_context* ctx = nullptr;

    // Context backing the KV cache tensors.
    ggml_context* kv_cache_ctx = nullptr;
};
/// Reads the layer-config entry `name` of `model` as a double.
double fairseq2_model_layer_config_double(const fairseq2_model& model, std::string name);

/// allocate the fairseq2 model and hyperparameters
extern "C" fairseq2_model* fairseq2_model_alloc();

// free the model and all its owned tensors
extern "C" void fairseq2_model_free(fairseq2_model* model);

// Sets the inference context used by the forward functions below.
// The model does NOT take ownership of `ctx` (see fairseq2_model::ctx).
extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx);

// Clears the per-layer KV cache (`kv_cache` is mutable, hence const&).
extern "C" void fairseq2_kv_cache_reset(const fairseq2_model& model);

// Creates a ggml_context from a caller-owned buffer; `buffer` must outlive
// the returned context.
ggml_context* ctx_from_buffer(std::vector<uint8_t>& buffer);

// Helpers to create/destroy a std::string across the C FFI boundary.
extern "C" std::string* std_string_alloc(char* c_str);
extern "C" void std_string_free(std::string* str);
// Computes audio filterbank features from a raw waveform
// (via kaldi-native-fbank; configuration is read from `model` under `prefix`).
extern "C" ggml_tensor* WaveformToFbank_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* waveform
);

// Returns the sub-tensor of `a` covering [start, end) along `axis`.
// NOTE(review): negative start/end presumably count from the end,
// Python-style — confirm against the implementation.
extern "C" ggml_tensor* ggml_slice(
    struct ggml_context* ctx,
    struct ggml_tensor* a,
    int axis,
    int64_t start,
    int64_t end
);

/// Merge the given dimension and the previous one in the tensor.
/// (..., num_heads, N, ...) -> (..., num_heads * N, ...)
/// dim is the position of the resulting merged dimension
/// ggml_flatten_1d(x, d) <==> torch.flatten(x, -1-d-1, -1-d)
extern "C" ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim);

/// Split the given dimension.
/// (..., K * N, ...) -> (..., K, N, ...)
/// dim is the position of the output dimension with the given number of elements (N).
extern "C" ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int num_el);
// The *_forward functions below implement the inference pass of the matching
// fairseq2 module; each looks up its weights in `model.tensors` using `prefix`
// as the key prefix, and builds its graph in the model's inference context.

extern "C" ggml_tensor* Linear_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* input
);

extern "C" ggml_tensor* LayerNorm_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* input
);

extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

extern "C" ggml_tensor* SiluFeedForwardNetwork_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

extern "C" ggml_tensor* MultiheadAttention_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* queries,   // (slen, d_in)
    ggml_tensor* keys,      // (klen, d_in)
    ggml_tensor* values,    // (klen, d_out)
    ggml_tensor* attn_mask  // (klen, slen)
);

extern "C" ggml_tensor* PositionalEmbedding_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* embeds
);

// Token embedding + positional encoding front-end of a transformer.
extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

extern "C" ggml_tensor* StandardTransformerEncoder_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

// Multi-head attention with relative positional encoding.
extern "C" ggml_tensor* RelativePositionMHA_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);
// Conformer building blocks (speech encoder path).

// Convolution module of a conformer block.
extern "C" ggml_tensor* ConvModule_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

extern "C" ggml_tensor* StandardConformerEncoderLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

extern "C" ggml_tensor* StandardConformerEncoder_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

// Adaptor layers that bridge the speech encoder output to the decoder.
extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

extern "C" ggml_tensor* StandardConformerEncoderAdaptor_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);
// Specifies the Layer Normalization order.
// see fairseq2/nn/transformer/norm_order.py
enum TransformerNormOrder {
    TRANSFORMER_NORM_ORDER_POST = 0,
    TRANSFORMER_NORM_ORDER_PRE = 1,
    TRANSFORMER_NORM_ORDER_PRE_WITH_NORMFORMER = 2
};
/// Holds the options to pass to a sequence generator.
struct SequenceGeneratorOptions {
    /// The beam size.
    int beam_size = 5;

    /// The minimum length of generated sequences (including prefix sequence).
    int min_seq_len = 1;

    /// The terms ``a`` and ``b`` of ``ax + b`` where ``x`` is the source
    /// sequence length. The generated sequences (including prefix sequence) will
    /// have the maximum length of ``min(hard_max_seq_len, ax + b)``. See also
    /// ``hard_max_seq_len``.
    float soft_max_seq_len_a = 1;
    int soft_max_seq_len_b = 200;

    /// The hard limit on maximum length of generated sequences.
    int hard_max_seq_len = 1024;

    /// The length penalty, where values less than 1.0 favor shorter, values
    /// greater than 1.0 favor longer sequences.
    float len_penalty = 1.0;

    /// The unknown symbol penalty, where values less than 0 produce more UNKs,
    /// values greater than 0 produce fewer UNKs.
    float unk_penalty = 0.0;

    /// If ``True``, normalizes scores by the length of generated sequences.
    bool normalize_scores = true;

    // memory needed is largely a fn of model size + sentence length and beam_size
    int mem_mb = 256;
};
/// Fully describes one generation request: search options, the forced prefix
/// tokens, and the special-token ids of the target vocabulary.
/// Callers are expected to fill in every field (aggregate-initialized).
struct SequenceGeneratorJob {
    /// Beam-search options.
    SequenceGeneratorOptions opts;
    /// Tokens forced at the start of every generated sequence.
    ggml_tensor* prefix_seq;
    std::int32_t pad_idx;
    std::int32_t unk_idx;
    std::int32_t bos_idx;
    std::int32_t eos_idx;
    std::int32_t num_threads;
};
  241. /// Represents a hypothesis produced by a sequence generator.
  242. struct Hypothesis {
  243. /// The generated sequence.
  244. ggml_tensor* seq;
  245. /// The score of the hypothesis.
  246. float score;
  247. /// The score of each individual sequence step.
  248. ggml_tensor* step_scores;
  249. /// The score of each lang tok at first decoding step, serving as LID
  250. ggml_tensor* lid_scores;
  251. };
/// Runs sequence generation against `model` given the encoder output.
/// Result hypotheses are allocated inside the caller-owned `result_ctx`.
/// NOTE(review): the number of returned hypotheses is presumably
/// `opts.opts.beam_size` — confirm against the implementation.
extern "C" Hypothesis* generate_sequence(
    fairseq2_model& model,
    const SequenceGeneratorJob& opts,
    ggml_tensor* encoder_output,
    ggml_tensor* encoder_padding_mask,
    ggml_context* result_ctx,
    int threads
);

// Tokenizes `text` with the model's vocabulary, writing token ids into `out`.
extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor* out);

// Detokenizes `tokens` into the caller-provided buffer `out`.
// NOTE(review): the return value is presumably the number of bytes written,
// and `out` sizing requirements are not visible here — confirm at the call site.
extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out);

// C++-only overload that also returns per-token strings and their scores.
// (Legal alongside the extern "C" version above: only one has C language linkage.)
std::pair<std::vector<std::string>, std::vector<float>> fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, ggml_tensor* scores, char* out);