// fairseq2.h — public interface of the fairseq2 ggml port.
#pragma once

#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "ggml.h"
#include "ggml-alloc.h"
#include "kaldi-native-fbank/csrc/feature-fbank.h"
// FORCE_ALLOC(name, ctx, expr): evaluate the tensor-creating expression `expr`
// with allocation forced ON in `ctx`, binding the result to `ggml_tensor* name`,
// then restore the context's previous no_alloc flag. Use when `ctx` may be in
// no-alloc (measure) mode but this particular tensor needs backing memory.
// NOTE: declares two variables, so it cannot be wrapped in do { } while(0);
// invoke it only at statement scope.
#define FORCE_ALLOC(name, ctx, ggml_new_tensor)\
bool name ## _save_no_alloc_ = ggml_get_no_alloc(ctx); \
ggml_set_no_alloc(ctx, false); \
ggml_tensor* name = ggml_new_tensor; \
ggml_set_no_alloc(ctx, name ## _save_no_alloc_);

// Token id type, matching llama.cpp's convention.
typedef int32_t llama_token;
// Classification of vocabulary entries, mirroring llama.cpp's token types.
extern "C" enum llama_token_type {
    LLAMA_TOKEN_TYPE_UNDEFINED = 0,
    LLAMA_TOKEN_TYPE_NORMAL = 1,
    LLAMA_TOKEN_TYPE_UNKNOWN = 2,
    LLAMA_TOKEN_TYPE_CONTROL = 3,
    LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
    LLAMA_TOKEN_TYPE_UNUSED = 5,
    LLAMA_TOKEN_TYPE_BYTE = 6,
};
  23. struct llama_vocab {
  24. using id = int32_t;
  25. using token = std::string;
  26. using ttype = llama_token_type;
  27. struct token_data {
  28. token text;
  29. float score;
  30. ttype type;
  31. };
  32. std::unordered_map<token, id> token_to_id;
  33. std::vector<token_data> id_to_token;
  34. std::unordered_map<token, id> special_tokens_cache;
  35. std::map<std::pair<std::string, std::string>, int> bpe_ranks;
  36. // default LLaMA special tokens
  37. id special_bos_id = 1;
  38. id special_eos_id = 2;
  39. id special_unk_id = 0;
  40. id special_sep_id = -1;
  41. id special_pad_id = -1;
  42. int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
  43. int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
  44. id linefeed_id = 13;
  45. id special_prefix_id = 32007;
  46. id special_middle_id = 32009;
  47. id special_suffix_id = 32008;
  48. id special_eot_id = 32010;
  49. int find_bpe_rank(std::string token_left, std::string token_right) const {
  50. GGML_ASSERT(token_left.find(" ") == std::string::npos);
  51. GGML_ASSERT(token_left.find("\n") == std::string::npos);
  52. GGML_ASSERT(token_right.find(" ") == std::string::npos);
  53. GGML_ASSERT(token_right.find("\n") == std::string::npos);
  54. auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
  55. if (it == bpe_ranks.end()) {
  56. return -1;
  57. }
  58. return it->second;
  59. }
  60. };
  61. struct KeyValueTensor {
  62. ggml_tensor* full_k;
  63. ggml_tensor* full_v;
  64. ggml_tensor* self_attn_mask;
  65. int step_nr;
  66. };
/// A loaded fairseq2 model: tensor storage, hyper-parameters, vocabularies,
/// and per-run inference state.
struct fairseq2_model {
    // Context containing all tensors memory
    ggml_context* tensors_ctx = nullptr;
    // Named tensors, all tensors should belong to tensors_ctx
    std::unordered_map<std::string, struct ggml_tensor *> tensors = {};
    // Hashmap containing model hyper-parameters.
    std::unordered_map<std::string, std::int64_t> hparams = {};
    // Hashmap containing layers hyper-parameters.
    // Normally those can be inferred from hparams, but it avoids doing this logic in GGML
    std::unordered_map<std::string, std::int64_t> layer_config = {};
    // Vocabulary for text transcription and translation APIs
    llama_vocab vocab;
    // Optional target vocabulary for bilingual models
    llama_vocab tgt_vocab;
    // KV cache for attention layers, keyed by layer name. Mutable so forward
    // passes can update it through a const model reference.
    mutable std::unordered_map<std::string, KeyValueTensor> kv_cache = {};
    // an inference context, not managed by this object
    // TODO: is this the best place to store this or should we also pass this to all forward methods ?
    ggml_context* ctx = nullptr;
    // Context presumably backing the kv_cache tensors — confirm in the loader.
    ggml_context* kv_cache_ctx = nullptr;
};
/// Read the layer-config entry `name` and convert it to double
/// (entries are stored as int64 in fairseq2_model::layer_config).
double fairseq2_model_layer_config_double(const fairseq2_model& model, std::string name);

/// allocate the fairseq2 model and hyperparameters
extern "C" fairseq2_model* fairseq2_model_alloc();

// free the model and all its owned tensors
extern "C" void fairseq2_model_free(fairseq2_model* model);

/// Set the inference context used by forward calls. The context is NOT owned
/// by the model (see fairseq2_model::ctx).
extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx);

/// Clear the model's KV cache (kv_cache is mutable, hence the const&).
extern "C" void fairseq2_kv_cache_reset(const fairseq2_model& model);

/// Build a ggml context whose storage lives inside `buffer`; the buffer must
/// outlive the returned context.
ggml_context* ctx_from_buffer(std::vector<uint8_t>& buffer);

/// Helpers for non-C++ callers (FFI) to create/destroy std::string arguments.
extern "C" std::string* std_string_alloc(char* c_str);
extern "C" void std_string_free(std::string* str);
/// Convert a raw waveform tensor to filterbank features
/// (presumably via kaldi-native-fbank, per the include — confirm in the .cpp).
/// `prefix` selects this module's entries in model.tensors / layer_config.
extern "C" ggml_tensor* WaveformToFbank_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* waveform
);

/// Return the sub-tensor of `a` covering [start, end) along `axis`.
extern "C" ggml_tensor* ggml_slice(
    struct ggml_context* ctx,
    struct ggml_tensor* a,
    int axis,
    int64_t start,
    int64_t end
);
/// Merge the given dimension and the previous one in the tensor.
/// (..., num_heads, N, ...) -> (..., num_heads * N, ...)
/// dim is the position of the resulting merged dimension
/// ggml_flatten_1d(x, dim) <==> torch.flatten(x, -1-dim-1, -1-dim)
extern "C" ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim);

/// Split the given dimension.
/// (..., K * N, ...) -> (..., K, N, ...)
/// dim is the position of the output dimension with the given number of element (N).
extern "C" ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int num_el);
/// Linear (affine) layer forward pass; `prefix` is the tensor-name prefix of
/// this module's weights in model.tensors.
extern "C" ggml_tensor* Linear_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* input
);

/// Layer normalization forward pass.
extern "C" ggml_tensor* LayerNorm_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* input
);

/// Standard two-layer feed-forward network forward pass.
extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

/// Feed-forward network with SiLU gating forward pass.
extern "C" ggml_tensor* SiluFeedForwardNetwork_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

/// Multi-head attention forward pass. `attn_mask` may be used to mask out
/// future or padded positions.
extern "C" ggml_tensor* MultiheadAttention_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* queries, // (slen, d_in)
    ggml_tensor* keys, // (klen, d_in)
    ggml_tensor* values, // (klen, d_out)
    ggml_tensor* attn_mask // (klen, slen)
);

/// Add positional encodings to `embeds`.
extern "C" ggml_tensor* PositionalEmbedding_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* embeds
);
/// Embedding frontend forward pass (token embedding for a transformer stack).
extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

/// One transformer encoder layer. `padding_mask` marks padded positions
/// (may be null — confirm against the implementation).
extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

/// Full transformer encoder stack.
extern "C" ggml_tensor* StandardTransformerEncoder_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

/// Multi-head attention with relative position encoding (conformer-style).
extern "C" ggml_tensor* RelativePositionMHA_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

/// Conformer convolution module forward pass.
extern "C" ggml_tensor* ConvModule_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
);

/// One conformer encoder layer.
extern "C" ggml_tensor* StandardConformerEncoderLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

/// Full conformer encoder stack.
extern "C" ggml_tensor* StandardConformerEncoder_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

/// One adaptor layer applied on top of the conformer encoder output.
extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);

/// Full adaptor stack applied on top of the conformer encoder output.
extern "C" ggml_tensor* StandardConformerEncoderAdaptor_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs,
    ggml_tensor* padding_mask
);
// Specifies the Layer Normalization order.
// see fairseq2/nn/transformer/norm_order.py
enum TransformerNormOrder {
    TRANSFORMER_NORM_ORDER_POST = 0, // normalize after the residual add
    TRANSFORMER_NORM_ORDER_PRE = 1,  // normalize before each sub-layer
    TRANSFORMER_NORM_ORDER_PRE_WITH_NORMFORMER = 2
};
/// Holds the options to pass to a sequence generator.
struct SequenceGeneratorOptions {
    /// The beam size.
    int beam_size = 5;
    /// The minimum length of generated sequences (including prefix sequence).
    int min_seq_len = 1;
    /// The terms ``a`` and ``b`` of ``ax + b`` where ``x`` is the source
    /// sequence length. The generated sequences (including prefix sequence) will
    /// have the maximum length of ``min(hard_max_seq_len, ax + b)``. See also
    /// ``hard_max_seq_len``.
    float soft_max_seq_len_a = 1;
    int soft_max_seq_len_b = 200;
    /// The hard limit on maximum length of generated sequences.
    int hard_max_seq_len = 1024;
    /// The length penalty, where values less than 1.0 favor shorter, values
    /// greater than 1.0 favor longer sequences.
    float len_penalty = 1.0;
    /// The unknown symbol penalty, where values less than 0 produce more UNKs,
    /// values greater than 0 produce fewer UNKs.
    float unk_penalty = 0.0;
    /// If ``True``, normalizes scores by the length of generated sequences.
    bool normalize_scores = true;
    /// Scratch-memory budget in MiB for the generator; memory needed is largely
    /// a function of model size + sentence length and beam_size.
    int mem_mb = 256;
};
/// A single sequence-generation request: options, the forced prefix of the
/// output sequences, and the special-token ids of the target vocabulary.
/// NOTE(review): all fields are caller-initialized; there are no defaults.
struct SequenceGeneratorJob {
    SequenceGeneratorOptions opts;
    /// Tokens every generated sequence starts with (counted in min/max length).
    ggml_tensor* prefix_seq;
    std::int32_t pad_idx;
    std::int32_t unk_idx;
    std::int32_t bos_idx;
    std::int32_t eos_idx;
    std::int32_t num_threads;
};
/// Represents a hypothesis produced by a sequence generator.
struct Hypothesis {
    /// The generated sequence.
    ggml_tensor* seq;
    /// The score of the hypothesis.
    float score;
    /// The score of each individual sequence step.
    ggml_tensor* step_scores;
    /// The score of each lang tok at first decoding step, serving as LID
    /// (language identification).
    ggml_tensor* lid_scores;
};
/// Generate output sequences for the given encoder output.
/// Resulting hypotheses are allocated inside `result_ctx` (not owned by the model).
/// NOTE(review): `threads` duplicates job.num_threads — confirm which one the
/// implementation honors.
extern "C" Hypothesis* generate_sequence(
    fairseq2_model& model,
    const SequenceGeneratorJob& opts,
    ggml_tensor* encoder_output,
    ggml_tensor* encoder_padding_mask,
    ggml_context* result_ctx,
    int threads
);

/// Tokenize `text` with the model's (sentencepiece) vocabulary into `out`;
/// `out` must be pre-allocated by the caller — confirm required size in the .cpp.
extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor* out);

/// Detokenize `tokens` into the caller-provided char buffer `out`;
/// returns the number of bytes written (presumably — confirm in the .cpp).
extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out);

/// Detokenize with per-piece scores; returns (pieces, scores) in addition to
/// filling `out`. C++-only overload (not extern "C").
std::pair<std::vector<std::string>, std::vector<float>> fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, ggml_tensor* scores, char* out);