// fairseq2.h
#pragma once

#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>

#include "ggml.h"
#include "kaldi-native-fbank/csrc/feature-fbank.h"
// Integer id of a vocabulary token.
typedef int32_t llama_token;

// Type tag attached to each vocabulary entry
// (naming presumably mirrors llama.cpp's token types — confirm against upstream).
extern "C" enum llama_token_type {
    LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
    LLAMA_TOKEN_TYPE_NORMAL       = 1,
    LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
    LLAMA_TOKEN_TYPE_CONTROL      = 3,
    LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
    LLAMA_TOKEN_TYPE_UNUSED       = 5,
    LLAMA_TOKEN_TYPE_BYTE         = 6,
};
  17. struct llama_vocab {
  18. using id = int32_t;
  19. using token = std::string;
  20. using ttype = llama_token_type;
  21. struct token_data {
  22. token text;
  23. float score;
  24. ttype type;
  25. };
  26. std::unordered_map<token, id> token_to_id;
  27. std::vector<token_data> id_to_token;
  28. std::unordered_map<token, id> special_tokens_cache;
  29. std::map<std::pair<std::string, std::string>, int> bpe_ranks;
  30. // default LLaMA special tokens
  31. id special_bos_id = 1;
  32. id special_eos_id = 2;
  33. id special_unk_id = 0;
  34. id special_sep_id = -1;
  35. id special_pad_id = -1;
  36. int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
  37. int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
  38. id linefeed_id = 13;
  39. id special_prefix_id = 32007;
  40. id special_middle_id = 32009;
  41. id special_suffix_id = 32008;
  42. id special_eot_id = 32010;
  43. int find_bpe_rank(std::string token_left, std::string token_right) const {
  44. GGML_ASSERT(token_left.find(" ") == std::string::npos);
  45. GGML_ASSERT(token_left.find("\n") == std::string::npos);
  46. GGML_ASSERT(token_right.find(" ") == std::string::npos);
  47. GGML_ASSERT(token_right.find("\n") == std::string::npos);
  48. auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
  49. if (it == bpe_ranks.end()) {
  50. return -1;
  51. }
  52. return it->second;
  53. }
  54. };
// One entry of the per-layer KV cache used during incremental decoding.
// NOTE(review): field semantics inferred from names and from fairseq2_model's
// "KV cache for attention layers" usage — confirm against the implementation.
struct KeyValueTensor {
    ggml_tensor* full_k;         // keys accumulated so far
    ggml_tensor* full_v;         // values accumulated so far
    ggml_tensor* self_attn_mask; // self-attention mask for the current step
    int step_nr;                 // decoding step counter
};
// A loaded fairseq2 model: owns its weight tensors and metadata.
// The inference context `ctx` is borrowed, not owned by this struct.
struct fairseq2_model {
    // Context containing all tensors memory
    ggml_context* tensors_ctx;

    // Named tensors, all tensors should belong to tensors_ctx
    std::unordered_map<std::string, struct ggml_tensor *> tensors;

    // Hashmap containing model hyper-parameters.
    std::unordered_map<std::string, std::int64_t> hparams;

    // Hashmap containing layers hyper-parameters.
    // Normally those can be inferred from hparams, but it avoids doing this logic in GGML
    std::unordered_map<std::string, std::int64_t> layer_config;

    // Tokenizer vocabulary (token <-> id maps, special tokens, BPE ranks).
    llama_vocab vocab;

    // KV cache for attention layers
    // (mutable so forward passes can update it through a const model reference)
    mutable std::unordered_map<std::string, KeyValueTensor> kv_cache;

    // an inference context, not managed by this object
    // TODO: is this the best place to store this or should we also pass this to all forward methods ?
    ggml_context* ctx;
};
/// Looks up the layer hyper-parameter `name` as a double.
double fairseq2_model_layer_config_double(const fairseq2_model& model, std::string name);

/// allocate the fairseq2 model and hyperparameters
extern "C" fairseq2_model* fairseq2_model_alloc();

// free the models and all its owned tensors
extern "C" void fairseq2_model_free(fairseq2_model* model);

/// Sets the (non-owned) inference context used by the forward functions.
extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx);

/// Clears the model's per-layer KV cache.
extern "C" void fairseq2_kv_cache_reset(const fairseq2_model& model);

/// Creates a ggml_context whose memory is backed by `buffer`.
ggml_context* ctx_from_buffer(std::vector<uint8_t>& buffer);

/// Allocates a std::string from a C string; intended for FFI callers.
extern "C" std::string* std_string_alloc(char* c_str);

/// Frees a string previously returned by std_string_alloc.
extern "C" void std_string_free(std::string* str);
/// Computes filterbank (fbank) features from a raw waveform tensor.
/// `prefix` names the module whose parameters/config are used.
extern "C" ggml_tensor* WaveformToFbank_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* waveform
);

/// Returns the sub-tensor of `a` covering [start, end) along `axis`.
/// NOTE(review): end-exclusive bound inferred from the slice naming — confirm.
extern "C" ggml_tensor* ggml_slice(
    struct ggml_context* ctx,
    struct ggml_tensor* a,
    int axis,
    int64_t start,
    int64_t end
);
/// Merge the given dimension and the previous one in the tensor.
/// (..., num_heads, N, ...) -> (..., num_heads * N, ...)
/// dim is the position of the resulting merged dimension
/// ggml_flatten_1d(x, d) <==> torch.flatten(x, -1-d-1, -1-d)
extern "C" ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim);

/// Split the given dimension.
/// (..., K * N, ...) -> (..., K, N, ...)
/// dim is the position of the output dimension with the given number of element (N).
extern "C" ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int num_el);
/// Applies the linear layer whose weights live under `prefix`.
extern "C" ggml_tensor* Linear_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* input
);

/// Applies the layer-norm whose parameters live under `prefix`.
extern "C" ggml_tensor* LayerNorm_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* input
);

/// Feed-forward network block of a transformer layer.
extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

/// Feed-forward network variant using SiLU activation (per its name).
extern "C" ggml_tensor* SiluFeedForwardNetwork_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

/// Multi-head attention. Pass queries == keys == values for self-attention;
/// `attn_mask` may restrict which key positions each query can attend to.
extern "C" ggml_tensor* MultiheadAttention_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* queries,  // (slen, d_in)
    ggml_tensor* keys,     // (klen, d_in)
    ggml_tensor* values,   // (klen, d_out)
    ggml_tensor* attn_mask // (klen, slen)
);
/// Adds positional information to the given embeddings.
extern "C" ggml_tensor* PositionalEmbedding_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* embeds
);

/// Embedding frontend: maps token ids to (position-aware) embeddings.
extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

/// One standard transformer encoder layer (self-attention + FFN).
extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

/// Multi-head self-attention with relative positional encoding (per its name).
extern "C" ggml_tensor* RelativePositionMHA_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);
/// Convolution module of a conformer block.
extern "C" ggml_tensor* ConvModule_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

/// One conformer encoder layer.
extern "C" ggml_tensor* StandardConformerEncoderLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

/// Full conformer encoder stack.
extern "C" ggml_tensor* StandardConformerEncoder_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

/// One layer of the encoder output adaptor.
extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

/// Adaptor stack applied on top of the conformer encoder output.
extern "C" ggml_tensor* StandardConformerEncoderAdaptor_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);
// Specifies the Layer Normalization order.
// see fairseq2/nn/transformer/norm_order.py
enum TransformerNormOrder {
    TRANSFORMER_NORM_ORDER_POST = 0, // normalize after each sub-layer
    TRANSFORMER_NORM_ORDER_PRE = 1,  // normalize before each sub-layer
    TRANSFORMER_NORM_ORDER_PRE_WITH_NORMFORMER = 2
};
/// Holds the options to pass to a sequence generator.
struct SequenceGeneratorOptions {
    /// The beam size.
    int beam_size = 5;

    /// The minimum length of generated sequences (including prefix sequence).
    int min_seq_len = 1;

    /// The terms ``a`` and ``b`` of ``ax + b`` where ``x`` is the source
    /// sequence length. The generated sequences (including prefix sequence) will
    /// have the maximum length of ``min(hard_max_seq_len, ax + b)``. See also
    /// ``hard_max_seq_len``.
    float soft_max_seq_len_a = 1;
    int soft_max_seq_len_b = 200;

    /// The hard limit on maximum length of generated sequences.
    int hard_max_seq_len = 1024;

    /// The length penalty, where values less than 1.0 favor shorter, values
    /// greater than 1.0 favor longer sequences.
    float len_penalty = 1.0;

    /// The unknown symbol penalty, where values less than 0 produce more UNKs,
    /// values greater than 0 produce fewer UNKs.
    float unk_penalty = 0.0;

    /// If ``True``, normalizes scores by the length of generated sequences.
    bool normalize_scores = true;
};
/// A single sequence-generation request: the generator options plus the
/// special-token ids and prefix used for decoding.
struct SequenceGeneratorJob {
    SequenceGeneratorOptions opts;
    ggml_tensor* prefix_seq;  // tokens every generated sequence starts with
    std::int32_t pad_idx;     // padding token id
    std::int32_t unk_idx;     // unknown token id
    std::int32_t bos_idx;     // beginning-of-sequence token id
    std::int32_t eos_idx;     // end-of-sequence token id
    std::int32_t num_threads; // number of compute threads to use
};
/// Represents a hypothesis produced by a sequence generator.
struct Hypothesis {
    /// The generated sequence.
    ggml_tensor* seq;

    /// The score of the hypothesis.
    float score;

    /// The score of each individual sequence step.
    ggml_tensor* step_scores;
};
/// Generates sequences conditioned on `encoder_output` according to the given
/// job (beam search options, special-token ids). Hypotheses are allocated in
/// `result_ctx`, which the caller owns.
/// NOTE(review): number of returned hypotheses (presumably opts.opts.beam_size)
/// is not visible here — confirm against the implementation.
extern "C" Hypothesis* generate_sequence(
    fairseq2_model& model,
    const SequenceGeneratorJob& opts,
    ggml_tensor* encoder_output,
    ggml_tensor* encoder_padding_mask,
    ggml_context* result_ctx
);

/// Tokenizes `text` with the model's SentencePiece vocabulary, writing the
/// resulting token ids into `out`.
extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor& out);

/// Detokenizes `tokens` into the caller-provided buffer `out`; returns
/// presumably the number of bytes written.
/// NOTE(review): required size of `out` is not documented here — confirm.
extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out);