// fairseq2.cpp

#include "ggml.h"
#include "fairseq2.h"

/// Allocate the fairseq2 model and hyperparameters.
extern "C" fairseq2_model* fairseq2_model_alloc() {
    // Pre-allocate some memory to write hyperparameters and tensor pointers.
    auto* model = new fairseq2_model;
    model->hparams = new std::uint8_t[8 * 1024];
    model->arch = new std::uint64_t[16 * 1024]; // max tensors allowed
    return model;
}

extern "C" void fairseq2_model_free(fairseq2_model* model) {
    // hparams and arch were allocated with new[], so release them with delete[].
    delete[] (std::uint64_t*)model->arch;
    delete[] (std::uint8_t*)model->hparams;
    delete model;
}

extern "C" std::string* std_string_alloc(char* c_str) {
    return new std::string(c_str);
}

extern "C" void std_string_free(std::string* str) {
    delete str;
}
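
// Usage sketch (hypothetical call site, not part of this file's API): the
// alloc/free pairs above exist so that foreign-language bindings can manage
// object lifetimes explicitly. From C++ the lifecycle looks like:
//
//     fairseq2_model* model = fairseq2_model_alloc();
//     std::string* prefix = std_string_alloc((char*)"decoder.layers.0");
//     // ... load tensors into model->tensors, run forward passes ...
//     std_string_free(prefix);
//     fairseq2_model_free(model);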

// Linear

std::size_t Linear_size(int32_t input_dim, int32_t output_dim)
{
    return (input_dim * output_dim * ggml_type_size(GGML_TYPE_F32)) // weight
        + (output_dim * ggml_type_size(GGML_TYPE_F32)); // bias
}

void Linear_init(
    Linear& self,
    fairseq2_model& model,
    const std::string &prefix,
    int input_dim,
    int output_dim,
    bool bias
) {
    // ne[0] must be input_dim so that ggml_mul_mat(weight, input) works in Linear_forward.
    self.weight = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, input_dim, output_dim);
    model.tensors[prefix + ".weight"] = self.weight;
    if (bias) {
        self.bias = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, output_dim);
        model.tensors[prefix + ".bias"] = self.bias;
    }
}

extern "C" ggml_tensor*
Linear_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* input // (d_in)
) {
    // Note: for now we assume un-batched input
    ggml_tensor* weight = model.tensors[prefix + ".weight"]; // (d_in, d_out)
    ggml_tensor* bias = model.tensors[prefix + ".bias"]; // (d_out)
    return ggml_add(
        model.ctx,
        ggml_mul_mat(model.ctx, weight, input), // (d_out)
        bias
    );
}
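
// Minimal end-to-end sketch of Linear_init + Linear_forward. This is a
// hypothetical example, not exported API: it assumes model.ctx was already
// created with enough memory by the loading code, that `Linear` is the plain
// struct declared in fairseq2.h, and that the ggml revision this file targets
// still provides ggml_build_forward / ggml_graph_compute_with_ctx. The "demo"
// prefix and the dimensions are made up; real weights come from the checkpoint.
static void linear_forward_example(fairseq2_model& model) {
    Linear linear;
    // Registers "demo.weight" (d_in=4, d_out=2) and "demo.bias" in model.tensors.
    // The weights are allocated but not loaded, so the output values are garbage.
    Linear_init(linear, model, "demo", /*input_dim*/4, /*output_dim*/2, /*bias*/true);

    // Un-batched input vector of size d_in, as assumed by Linear_forward.
    ggml_tensor* x = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, 4);

    ggml_tensor* y = Linear_forward(model, "demo", x); // (d_out)

    ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute_with_ctx(model.ctx, &gf, /*n_threads*/1);
}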

// LayerNorm

std::size_t LayerNorm_size(int32_t dim)
{
    return 2 * dim * ggml_type_size(GGML_TYPE_F32); // weight and bias
}

void LayerNorm_init(
    LayerNorm& self,
    fairseq2_model& model,
    const std::string &prefix,
    int dim
) {
    self.weight = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, dim);
    model.tensors[prefix + ".weight"] = self.weight;
    self.bias = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, dim);
    model.tensors[prefix + ".bias"] = self.bias;
}

extern "C" ggml_tensor* LayerNorm_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* input
) {
    ggml_tensor* weight = model.tensors[prefix + ".weight"];
    ggml_tensor* bias = model.tensors[prefix + ".bias"];
    auto ctx = model.ctx;
    // TODO: should `eps` be part of the unity hparams?
    input = ggml_norm(ctx, input, /*eps*/1e-5f);
    return ggml_add(
        ctx,
        ggml_mul(ctx, ggml_repeat(ctx, weight, input), input),
        ggml_repeat(ctx, bias, input)
    );
}
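
// What LayerNorm_forward computes, spelled out (standard layer norm):
//
//     y = (x - mean(x)) / sqrt(var(x) + eps) * weight + bias
//
// ggml_norm performs the normalization over ne[0] (the feature dimension
// here); the ggml_repeat calls broadcast the 1-d weight and bias across the
// remaining dimensions before the elementwise multiply and add.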

// StandardFeedForwardNetwork

std::size_t StandardFeedForwardNetwork_size(int32_t dim, int32_t inner_dim)
{
    // inner_layer_norm normalizes the inner activation, hence inner_dim.
    return LayerNorm_size(inner_dim) + Linear_size(dim, inner_dim) + Linear_size(inner_dim, dim);
}

void StandardFeedForwardNetwork_init(
    StandardFeedForwardNetwork& self,
    fairseq2_model& model,
    const std::string &prefix,
    int model_dim,
    int inner_dim
) {
    Linear_init(self.inner_proj, model, prefix + ".inner_proj", model_dim, inner_dim, true);
    LayerNorm_init(self.inner_layer_norm, model, prefix + ".inner_layer_norm", inner_dim);
    Linear_init(self.output_proj, model, prefix + ".output_proj", inner_dim, model_dim, true);
}

extern "C" ggml_tensor* StandardFeedForwardNetwork_forward(
    fairseq2_model& model,
    const std::string& prefix,
    ggml_tensor* seqs
) {
    seqs = Linear_forward(model, prefix + ".inner_proj", seqs);
    // inner_activation = ReLU // TODO: support other activations
    seqs = ggml_relu(model.ctx, seqs);
    if (model.tensors.find(prefix + ".inner_layer_norm.weight") != model.tensors.end()) {
        seqs = LayerNorm_forward(model, prefix + ".inner_layer_norm", seqs);
    }
    // TODO: inference-time dropout
    // if self.inner_dropout is not None:
    //     seqs = self.inner_dropout(seqs)
    seqs = Linear_forward(model, prefix + ".output_proj", seqs);
    return seqs;
}
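
// Shape walk-through for StandardFeedForwardNetwork_forward, using the
// (features, sequence) ne-order of the comments in this file, with S the
// sequence length:
//
//     seqs (model_dim, S)
//       -> inner_proj                  -> (inner_dim, S)
//       -> ReLU                        -> (inner_dim, S)
//       -> [optional inner_layer_norm] -> (inner_dim, S)
//       -> output_proj                 -> (model_dim, S)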

void MultiheadAttention_init(
    MultiheadAttention& self,
    fairseq2_model& model,
    const std::string &prefix,
    int model_dim,
    int num_heads
) {
    bool bias = true;
    int num_key_value_heads = num_heads;
    int head_dim = model_dim / num_heads;
    Linear_init(self.q_proj, model, prefix + ".q_proj", model_dim, model_dim, bias);
    Linear_init(self.k_proj, model, prefix + ".k_proj", model_dim, head_dim * num_key_value_heads, bias);
    Linear_init(self.v_proj, model, prefix + ".v_proj", model_dim, model_dim, bias);
    // MultiheadAttention_forward looks up prefix + ".output_proj", so it must be initialized here too.
    Linear_init(self.output_proj, model, prefix + ".output_proj", model_dim, model_dim, bias);
    // (H, 1, K_h)
    self.bias_k = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, num_heads, 1, head_dim * num_key_value_heads / num_heads);
    // (H, 1, V_h)
    self.bias_v = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, num_heads, 1, model_dim / num_heads);
}

ggml_tensor* reshape_num_head(ggml_context* ctx, ggml_tensor* x, int num_heads) {
    // Linear_forward outputs (K_proj, S), so the sequence length is ne[1].
    int slen = x->ne[1];
    // (K_proj, S) -> (K_h, H, S)
    x = ggml_reshape_3d(ctx, x, x->ne[0] / num_heads, num_heads, slen);
    // (K_h, H, S) -> (K_h, S, H)
    return ggml_permute(ctx, x, 0, 2, 1, 3);
}
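
// Worked example for reshape_num_head, assuming model_dim = 1024 and
// num_heads = 16 (so head_dim = 64), on a projected tensor of S positions:
//
//     x: ne = (1024, S)                    // (K_proj, S)
//     ggml_reshape_3d -> ne = (64, 16, S)  // (K_h, H, S)
//     ggml_permute    -> ne = (64, S, 16)  // (K_h, S, H)
//
// ggml_permute returns a non-contiguous view, so downstream ops must accept
// that layout; ggml_flash_attn expects this (head_dim, seq, heads) ordering
// for q and k.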

extern "C" ggml_tensor* // (d_in, len_q)
MultiheadAttention_forward(
    fairseq2_model& model,
    const std::string &prefix,
    ggml_tensor* queries, // (d_in, len_q)
    ggml_tensor* keys,    // (d_in, len_k)
    ggml_tensor* values,  // (d_out, len_k)
    ggml_tensor* mask     // (len_k, len_q)
) {
    int num_heads = 16; // TODO: read this from the model hparams instead of hardcoding it
    ggml_context* ctx = model.ctx;
    ggml_tensor* q = Linear_forward(model, prefix + ".q_proj", queries);
    q = reshape_num_head(ctx, q, num_heads);
    ggml_tensor* k = Linear_forward(model, prefix + ".k_proj", keys);
    k = reshape_num_head(ctx, k, num_heads);
    ggml_tensor* v = Linear_forward(model, prefix + ".v_proj", values);
    v = reshape_num_head(ctx, v, num_heads);
    ggml_tensor* attn = ggml_flash_attn(model.ctx, q, k, v, /*masked*/true);
    attn = Linear_forward(model, prefix + ".output_proj", attn);
    return attn;
    // TODO: replace ggml_flash_attn with an explicit SDPA_forward:
    // ggml_tensor* attn = SDPA_forward(q, k, v, nullptr);
    // // (H, S, V_h) -> (S, H, V_h)
    // attn = ggml_transpose(ctx, attn);
    // // (S, H, V_h) -> (S, V_proj)
    // attn = ggml_reshape_3d()
}
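
// Self-attention usage sketch (hypothetical call site and prefix): for
// encoder self-attention the same tensor is passed as queries, keys and
// values, e.g.
//
//     ggml_tensor* out = MultiheadAttention_forward(
//         model, "text_encoder.layers.0.self_attn",
//         seqs, seqs, seqs, /*mask*/nullptr);
//
// Note that the mask argument is currently unused: ggml_flash_attn only
// receives the boolean `masked` flag above.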

// extern "C" ggml_tensor* // (d_out, len_q)
// SDPA_forward(
//     fairseq2_model& model,
//     const std::string &prefix,
//     ggml_tensor* queries, // (d_in, len_q)
//     ggml_tensor* keys,    // (d_in, len_k)
//     ggml_tensor* values,  // (d_out, len_k)
//     ggml_tensor* mask     // (len_k, len_q)
// ) {
//     return queries;
// }