// unity.cpp

#include "ggml/ggml.h"
#include "ggml/ggml-alloc.h"

#include "common.h"
#include "common-ggml.h"

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>    // time()
#include <fstream>
#include <iostream>
#include <map>
#include <random>   // std::mt19937
#include <string>
#include <vector>
// default hparams
struct unity_hparams {
    int32_t n_text_vocab         = 256206;
    int32_t n_unit_vocab         = 10084;
    int32_t n_audio_enc_dim      = 1024;
    int32_t n_audio_enc_ffn_dim  = 4096;
    int32_t n_audio_enc_feat_dim = 160;
    int32_t n_audio_enc_layer    = 24;
    int32_t n_audio_enc_head     = 16;
    int32_t ftype                = 1;
    float   eps                  = 1e-5f;
};
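
// Note: unity_model_load() below reads these hyperparameters from the model
// file in the following order (one int32 each, right after the magic):
//   n_text_vocab, n_audio_enc_dim, n_audio_enc_ffn_dim, n_audio_enc_feat_dim,
//   n_audio_enc_layer, n_audio_enc_head, ftype
// n_unit_vocab and eps are not read from the file and keep their defaults.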
// layer def
struct audio_enc_layer {
    struct ggml_tensor * self_attn_layer_norm_w;
    struct ggml_tensor * self_attn_layer_norm_b;

    struct ggml_tensor * self_attn_linear_k_w;
    struct ggml_tensor * self_attn_linear_k_b;
    struct ggml_tensor * self_attn_linear_q_w;
    struct ggml_tensor * self_attn_linear_q_b;
    struct ggml_tensor * self_attn_linear_v_w;
    struct ggml_tensor * self_attn_linear_v_b;
    struct ggml_tensor * self_attn_linear_out_w;
    struct ggml_tensor * self_attn_linear_out_b;
    struct ggml_tensor * self_attn_linear_pos_w;
    struct ggml_tensor * self_attn_pos_bias_u;
    struct ggml_tensor * self_attn_pos_bias_v;

    struct ggml_tensor * conv_layer_norm_w;
    struct ggml_tensor * conv_layer_norm_b;
    struct ggml_tensor * conv_pointwise_conv1_w;
    struct ggml_tensor * conv_depthwise_conv_w;
    struct ggml_tensor * conv_batch_norm_w;
    struct ggml_tensor * conv_batch_norm_b;
    struct ggml_tensor * conv_batch_norm_running_mean;
    struct ggml_tensor * conv_batch_norm_running_var;
    struct ggml_tensor * conv_batch_norm_num_batches_tracked;
    struct ggml_tensor * conv_pointwise_conv2_w;

    struct ggml_tensor * ffn1_layer_norm_w;
    struct ggml_tensor * ffn1_layer_norm_b;
    struct ggml_tensor * ffn1_w1;
    struct ggml_tensor * ffn1_b1;
    struct ggml_tensor * ffn1_w2;
    struct ggml_tensor * ffn1_b2;

    struct ggml_tensor * ffn2_layer_norm_w;
    struct ggml_tensor * ffn2_layer_norm_b;
    struct ggml_tensor * ffn2_w1;
    struct ggml_tensor * ffn2_b1;
    struct ggml_tensor * ffn2_w2;
    struct ggml_tensor * ffn2_b2;

    struct ggml_tensor * final_layer_norm_w;
    struct ggml_tensor * final_layer_norm_b;
};
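
// The tensor names above match the standard Conformer block used by the UnitY
// speech encoder: a first (macaron) feed-forward module, relative-position
// multi-head self-attention (pos_bias_u / pos_bias_v are the Transformer-XL
// style biases), a convolution module (pointwise conv -> depthwise conv ->
// batch norm -> pointwise conv), a second feed-forward module, and a final
// layer norm. The forward graph is not built in this file; see the sketch
// after unity_model_load() for how an individual weight/bias pair would
// typically be applied with ggml.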
// struct ggml_tensor * conv_ln;
// struct ggml_tensor * conv_pool_1d;

// model def
struct unity_model {
    unity_hparams hparams;

    // audio encoder
    struct ggml_tensor * post_extract_proj;
    struct ggml_tensor * audio_enc_pos_conv;
    std::vector<audio_enc_layer> audio_enc_layers;

    // text encoder
    // std::vector<text_enc_layer> text_enc_layers;
    // adaptor
    // std::vector<adapter_layer> adapter_layers;
    // text decoder
    // std::vector<text_dec_layer> text_dec_layers;
    // unit decoder
    // std::vector<unit_dec_layer> unit_dec_layers;

    struct ggml_context * ctx;
    std::map<std::string, struct ggml_tensor *> tensors;
};
// model load
bool unity_model_load(const std::string & fname, unity_model & model, gpt_vocab & vocab) {
    printf("%s: loading model from '%s'\n", __func__, fname.c_str());

    auto fin = std::ifstream(fname, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return false;
    }

    // verify magic
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
        if (magic != GGML_FILE_MAGIC) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
            return false;
        }
    }
    // load hparams
    {
        auto & hparams = model.hparams;

        fin.read((char *) &hparams.n_text_vocab,         sizeof(hparams.n_text_vocab));
        fin.read((char *) &hparams.n_audio_enc_dim,      sizeof(hparams.n_audio_enc_dim));
        fin.read((char *) &hparams.n_audio_enc_ffn_dim,  sizeof(hparams.n_audio_enc_ffn_dim));
        fin.read((char *) &hparams.n_audio_enc_feat_dim, sizeof(hparams.n_audio_enc_feat_dim));
        fin.read((char *) &hparams.n_audio_enc_layer,    sizeof(hparams.n_audio_enc_layer));
        fin.read((char *) &hparams.n_audio_enc_head,     sizeof(hparams.n_audio_enc_head));
        fin.read((char *) &hparams.ftype,                sizeof(hparams.ftype));

        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

        printf("%s: n_text_vocab = %d\n", __func__, hparams.n_text_vocab);
        printf("%s: n_audio_enc_dim = %d\n", __func__, hparams.n_audio_enc_dim);
        printf("%s: n_audio_enc_ffn_dim = %d\n", __func__, hparams.n_audio_enc_ffn_dim);
        printf("%s: n_audio_enc_feat_dim = %d\n", __func__, hparams.n_audio_enc_feat_dim);
        printf("%s: n_audio_enc_layer = %d\n", __func__, hparams.n_audio_enc_layer);
        printf("%s: n_audio_enc_head = %d\n", __func__, hparams.n_audio_enc_head);
        printf("%s: ftype = %d\n", __func__, hparams.ftype);
        printf("%s: qntvr = %d\n", __func__, qntvr);

        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
    }
    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
    if (wtype == GGML_TYPE_COUNT) {
        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
                __func__, fname.c_str(), model.hparams.ftype);
        return false;
    }

    auto & ctx = model.ctx;

    size_t ctx_size = 0;

    {
        const auto & hparams = model.hparams;

        const int n_audio_enc_dim     = hparams.n_audio_enc_dim;
        const int n_audio_enc_ffn_dim = hparams.n_audio_enc_ffn_dim;
        const int n_audio_enc_layer   = hparams.n_audio_enc_layer;
        // const int n_text_vocab = hparams.n_text_vocab;

        const int kernel_size = 31;

        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // self_attn_layer_norm_w
        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // self_attn_layer_norm_b

        ctx_size += n_audio_enc_layer*(5*n_audio_enc_dim*n_audio_enc_dim*ggml_type_sizef(wtype)); // self_attn_w
        ctx_size += n_audio_enc_layer*(4*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32));         // self_attn_b

        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_layer_norm_w
        ctx_size += n_audio_enc_layer*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_layer_norm_b

        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*n_audio_enc_dim*2*ggml_type_sizef(wtype)); // conv_pointwise_conv1_w

        ctx_size += n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_batch_norm_w
        ctx_size += n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32); // conv_batch_norm_b

        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*n_audio_enc_dim*kernel_size*ggml_type_sizef(wtype)); // conv_depthwise_conv_w
        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*n_audio_enc_dim*ggml_type_sizef(wtype));             // conv_pointwise_conv2_w

        ctx_size += 2*n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // ffn{1,2}_layer_norm_w
        ctx_size += 2*n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // ffn{1,2}_layer_norm_b

        ctx_size += 2*n_audio_enc_layer*(2*n_audio_enc_dim*n_audio_enc_ffn_dim*ggml_type_sizef(wtype)); // ffn{1,2}_w{1,2}
        ctx_size += 2*n_audio_enc_layer*(2*n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32));             // ffn{1,2}_b{1,2}

        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm_w
        ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // final_layer_norm_b

        // Adaptor
        // ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // conv_ln
        // ctx_size += n_audio_enc_layer*(n_audio_enc_dim*ggml_type_sizef(GGML_TYPE_F32)); // conv_pool_1d

        // object overhead might differ depending on the structure and other miscellaneous factors
        ctx_size += (6 + 12*n_audio_enc_layer)*512; // updated object overhead

        printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }
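
    // Note: the estimate above is loose rather than exact. The per-layer
    // tensors below are all created as GGML_TYPE_F32 even where the budget
    // uses wtype, and the conv_batch_norm_* / pos_bias_{u,v} buffers are not
    // budgeted per layer; on the other hand, the conv_depthwise_conv_w term
    // budgets n_audio_enc_dim*n_audio_enc_dim*kernel_size elements per layer
    // while the tensor actually allocated is only [n_audio_enc_dim, kernel_size].
    // The latter surplus appears to dominate, so ggml_init() still gets enough
    // memory, but the MB figure printed here should be read as approximate.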
    // create the ggml context
    {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ctx_size,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            return false;
        }
    }
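
    // with no_alloc == false, every ggml_new_tensor_* call below carves its
    // data out of this single ctx_size buffer, so the loader can read weights
    // straight into tensor->data.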
    // prepare memory for the weights
    {
        const auto & hparams = model.hparams;

        const int n_audio_enc_dim     = hparams.n_audio_enc_dim;
        const int n_audio_enc_ffn_dim = hparams.n_audio_enc_ffn_dim;
        // const int n_audio_enc_feat_dim = hparams.n_audio_enc_feat_dim;
        const int n_audio_enc_layer   = hparams.n_audio_enc_layer;
        const int n_audio_enc_head    = hparams.n_audio_enc_head;
        // const int n_text_vocab = hparams.n_text_vocab;

        model.audio_enc_layers.resize(n_audio_enc_layer);

        // model.post_extract_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_feat_dim);
        // model.post_extract_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
        // model.tensors["model/post_extract_proj/w"] = model.post_extract_proj_w;
        // model.tensors["model/post_extract_proj/b"] = model.post_extract_proj_b;
        // model.audio_enc_pos_conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim, 1);
        // model.tensors["model/audio_enc_pos_conv/w"] = model.audio_enc_pos_conv_w;
        // model.audio_enc_pos_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
        // model.tensors["model/audio_enc_pos_conv/b"] = model.audio_enc_pos_conv_b;

        for (int i = 0; i < n_audio_enc_layer; ++i) {
            auto & layer = model.audio_enc_layers[i];

            layer.self_attn_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.self_attn_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);

            layer.self_attn_linear_k_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
            layer.self_attn_linear_k_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.self_attn_linear_q_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
            layer.self_attn_linear_q_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.self_attn_linear_v_w   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
            layer.self_attn_linear_v_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.self_attn_linear_out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
            layer.self_attn_linear_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.self_attn_linear_pos_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);
            layer.self_attn_pos_bias_u   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_head, n_audio_enc_dim / n_audio_enc_head);
            layer.self_attn_pos_bias_v   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_head, n_audio_enc_dim / n_audio_enc_head);

            layer.conv_layer_norm_w      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.conv_layer_norm_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.conv_pointwise_conv1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2*n_audio_enc_dim, n_audio_enc_dim);
            layer.conv_depthwise_conv_w  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, 31);
            layer.conv_batch_norm_w      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.conv_batch_norm_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.conv_batch_norm_running_mean        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.conv_batch_norm_running_var         = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.conv_batch_norm_num_batches_tracked = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
            layer.conv_pointwise_conv2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_dim);

            layer.ffn1_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.ffn1_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.ffn1_w1           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim, n_audio_enc_dim);
            layer.ffn1_b1           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim);
            layer.ffn1_w2           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_ffn_dim);
            layer.ffn1_b2           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);

            layer.ffn2_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.ffn2_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.ffn2_w1           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim, n_audio_enc_dim);
            layer.ffn2_b1           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_ffn_dim);
            layer.ffn2_w2           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_enc_dim, n_audio_enc_ffn_dim);
            layer.ffn2_b2           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);

            layer.final_layer_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);
            layer.final_layer_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_enc_dim);

            // map by name
            model.tensors["model/h" + std::to_string(i) + "/self_attn_layer_norm/w"] = layer.self_attn_layer_norm_w;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_layer_norm/b"] = layer.self_attn_layer_norm_b;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_k/w"]   = layer.self_attn_linear_k_w;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_k/b"]   = layer.self_attn_linear_k_b;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_q/w"]   = layer.self_attn_linear_q_w;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_q/b"]   = layer.self_attn_linear_q_b;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_v/w"]   = layer.self_attn_linear_v_w;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_v/b"]   = layer.self_attn_linear_v_b;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_out/w"] = layer.self_attn_linear_out_w;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_out/b"] = layer.self_attn_linear_out_b;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_linear_pos/w"] = layer.self_attn_linear_pos_w;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_pos_bias/u"]   = layer.self_attn_pos_bias_u;
            model.tensors["model/h" + std::to_string(i) + "/self_attn_pos_bias/v"]   = layer.self_attn_pos_bias_v;

            model.tensors["model/h" + std::to_string(i) + "/conv_layer_norm/w"]      = layer.conv_layer_norm_w;
            model.tensors["model/h" + std::to_string(i) + "/conv_layer_norm/b"]      = layer.conv_layer_norm_b;
            model.tensors["model/h" + std::to_string(i) + "/conv_pointwise_conv1/w"] = layer.conv_pointwise_conv1_w;
            model.tensors["model/h" + std::to_string(i) + "/conv_depthwise_conv/w"]  = layer.conv_depthwise_conv_w;
            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/w"]      = layer.conv_batch_norm_w;
            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/b"]      = layer.conv_batch_norm_b;
            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/m"]      = layer.conv_batch_norm_running_mean;
            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/v"]      = layer.conv_batch_norm_running_var;
            model.tensors["model/h" + std::to_string(i) + "/conv_batch_norm/n"]      = layer.conv_batch_norm_num_batches_tracked;
            model.tensors["model/h" + std::to_string(i) + "/conv_pointwise_conv2/w"] = layer.conv_pointwise_conv2_w;

            model.tensors["model/h" + std::to_string(i) + "/ffn1_layer_norm/w"] = layer.ffn1_layer_norm_w;
            model.tensors["model/h" + std::to_string(i) + "/ffn1_layer_norm/b"] = layer.ffn1_layer_norm_b;
            model.tensors["model/h" + std::to_string(i) + "/ffn1_w_1/w"]        = layer.ffn1_w1;
            model.tensors["model/h" + std::to_string(i) + "/ffn1_w_1/b"]        = layer.ffn1_b1;
            model.tensors["model/h" + std::to_string(i) + "/ffn1_w_2/w"]        = layer.ffn1_w2;
            model.tensors["model/h" + std::to_string(i) + "/ffn1_w_2/b"]        = layer.ffn1_b2;

            model.tensors["model/h" + std::to_string(i) + "/ffn2_layer_norm/w"] = layer.ffn2_layer_norm_w;
            model.tensors["model/h" + std::to_string(i) + "/ffn2_layer_norm/b"] = layer.ffn2_layer_norm_b;
            model.tensors["model/h" + std::to_string(i) + "/ffn2_w_1/w"]        = layer.ffn2_w1;
            model.tensors["model/h" + std::to_string(i) + "/ffn2_w_1/b"]        = layer.ffn2_b1;
            model.tensors["model/h" + std::to_string(i) + "/ffn2_w_2/w"]        = layer.ffn2_w2;
            model.tensors["model/h" + std::to_string(i) + "/ffn2_w_2/b"]        = layer.ffn2_b2;

            model.tensors["model/h" + std::to_string(i) + "/final_layer_norm/w"] = layer.final_layer_norm_w;
            model.tensors["model/h" + std::to_string(i) + "/final_layer_norm/b"] = layer.final_layer_norm_b;
        }
    }
    // load weights
    {
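        // Each tensor is stored as a self-describing record, read back below as:
        //   int32 n_dims, int32 name_length, int32 ttype,
        //   n_dims x int32 dimensions, name_length bytes of name,
        //   followed by the raw tensor data (ggml_nbytes worth).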
        size_t total_size = 0;

        while (true) {
            int32_t n_dims;
            int32_t length;
            int32_t ttype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

            if (fin.eof()) {
                break;
            }

            int32_t nelements = 1;
            int32_t ne[3] = { 1, 1, 1 };
            for (int i = 0; i < n_dims; ++i) {
                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
                nelements *= ne[i];
            }

            std::string name(length, 0);
            fin.read(&name[0], length);
            std::cout << "loading " << name << " " << n_dims << std::endl;

            if (model.tensors.find(name) == model.tensors.end()) {
                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
                return false;
            }

            auto tensor = model.tensors[name];
            if (ggml_nelements(tensor) != nelements) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
                return false;
            }

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
                return false;
            }

            // for debugging
            if (0) {
                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
            }

            const size_t bpe = ggml_type_size(ggml_type(ttype));

            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
                return false;
            }

            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

            total_size += ggml_nbytes(tensor);
        }

        printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
    }
    fin.close();

    return true;
}
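
// The encoder forward pass is not implemented in this file. As a rough
// illustration of how the tensors loaded above would be consumed, here is a
// minimal sketch of applying one weight/bias pair (e.g. self_attn_linear_q_w /
// self_attn_linear_q_b) to an activation of shape [n_audio_enc_dim, n_frames]
// inside a ggml graph, following the pattern used by the other ggml examples.
// The helper name is illustrative only, and the exact orientation of each
// weight matrix depends on how the conversion script wrote it out.
struct ggml_tensor * unity_linear(
        struct ggml_context * ctx0,
        struct ggml_tensor  * w,    // 2-D weight; ggml_mul_mat expects w->ne[0] == cur->ne[0]
        struct ggml_tensor  * b,    // 1-D bias, length == output dim
        struct ggml_tensor  * cur)  // activations, one column per frame
{
    // matrix product: result has one output-dim column per input frame
    cur = ggml_mul_mat(ctx0, w, cur);
    // broadcast the bias over all frames and add it
    cur = ggml_add(ctx0, ggml_repeat(ctx0, b, cur), cur);
    return cur;
}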
int main(int argc, char ** argv) {
    // ggml_time_init();
    // const int64_t t_main_start_us = ggml_time_us();

    gpt_params params;
    params.model = "models/gpt-2-117M/ggml-model.bin";

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    if (params.seed < 0) {
        params.seed = time(NULL);
    }

    printf("%s: seed = %d\n", __func__, params.seed);

    std::mt19937 rng(params.seed);
    if (params.prompt.empty()) {
        params.prompt = gpt_random_prompt(rng);
    }

    // int64_t t_load_us = 0;

    gpt_vocab vocab;
    unity_model model;

    // load the model
    {
        // const int64_t t_start_us = ggml_time_us();

        if (!unity_model_load(params.model, model, vocab)) {
            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
            return 1;
        }

        // t_load_us = ggml_time_us() - t_start_us;
        // test_gpt_tokenizer(vocab, params.token_test);
    }

    // keep this buffer alive while evaluating the model
    // std::vector<uint8_t> compute_buffer;
    // struct ggml_allocr * allocr = NULL;
    // // allocate the compute buffer
    // {
    //     allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
    //
    //     // create the worst case graph for memory usage estimation
    //     int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
    //     int n_past = model.hparams.n_ctx - n_tokens;
    //     struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
    //
    //     // compute the required memory
    //     size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
    //
    //     // recreate the allocator with the required memory
    //     ggml_allocr_free(allocr);
    //
    //     compute_buffer.resize(mem_size);
    //     allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN);
    //
    //     fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
    // }

    ggml_free(model.ctx);

    return 0;
}