// model_loader.cpp

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <stdexcept>
#include <string>
#include <unordered_map>

#include "model_loader.h"

#define DEBUG_MODEL_LOAD 0
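
// Loads a fairseq2 model serialized in a ggml-style binary file. The format is
// little-endian and laid out as: magic number, hparams, layer config, vocab,
// then the tensor weights (see load_fairseq2_ggml_file at the bottom). The
// magic number is validated by open_ggml_file below.
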
std::ifstream open_ggml_file(const char* fname) {
    printf("%s: loading model from '%s'\n", __func__, fname);

    auto fin = std::ifstream(std::string(fname), std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname);
        throw std::invalid_argument("failed to open file."); // TODO Merge error message.
    }

    std::uint32_t magic;
    fin.read((char*)&magic, sizeof(magic));
    if (magic != GGML_FILE_MAGIC) {
        fprintf(stderr, "%s: invalid model file '%s' (bad magic 0x%08x)\n", __func__, fname, magic);
        throw std::invalid_argument("invalid model file (bad magic)."); // TODO Merge error message.
    }
    return fin;
}
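
// Records every dotted prefix of `name` (e.g. "a.b" and "a" for "a.b.weight")
// as a nullptr placeholder in model.tensors. The assert catches the case where
// a name already registered as a real tensor is later used as a prefix.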
void register_prefix(fairseq2_model &model, const std::string& name) {
    std::size_t i = name.find_last_of('.');
    while (i != std::string::npos && i > 0) {
        std::string prefix = name.substr(0, i);
        auto prev_tensor = model.tensors.find(prefix);
        if (prev_tensor != model.tensors.end()) {
            GGML_ASSERT(prev_tensor->second == nullptr);
        }
        model.tensors[prefix] = nullptr;
        i = name.find_last_of('.', i - 1);
    }
}
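
// Reads the weights section: a tensor count and the f32 context size, followed
// by one (name, tensor) record per weight. All tensors are allocated inside
// model.tensors_ctx; returns the size of that context in bytes.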
std::int64_t
model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin)
{
    std::int64_t num_tensor = 0;
    std::int64_t f32_ctx_size = 0;
    fin.read((char*) &num_tensor, sizeof(num_tensor));
    fin.read((char*) &f32_ctx_size, sizeof(f32_ctx_size));

    // TODO: it might be interesting to allow the caller to not upcast the weights to float32.
    // Note this requires changing the on-disk format.
    bool as_float32 = true;
    std::int64_t f16_ctx_size = f32_ctx_size;
    // fin.read((char*) &f16_ctx_size, sizeof(f16_ctx_size));

    struct ggml_init_params params = {
        /*.mem_size   =*/ as_float32 ? f32_ctx_size : f16_ctx_size,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    model.tensors_ctx = ggml_init(params);

    size_t model_size = 0;
    for (int i = 0; i < num_tensor; ++i) {
        std::string name = get_name(fin);
        if (name.length() == 0)
            break;
        auto tensor = load_tensor_value(fin, model.tensors_ctx, as_float32);
        if (tensor == nullptr) {
            // Abort in case of error, the input stream is corrupted at this point.
            printf("Error while reading tensor %s\n", name.c_str());
            throw std::invalid_argument("Error while reading tensor from file.");
        }
        register_prefix(model, name);
        ggml_set_name(tensor, name.c_str());
        model.tensors[name] = tensor;
        if (DEBUG_MODEL_LOAD) {
            printf("%s [%5ld, %5ld], type = %6s, %6.2f MB, %9zu bytes\n",
                name.c_str(), tensor->ne[0], tensor->ne[1], ggml_type_name(tensor->type),
                ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
        }
        model_size += ggml_nbytes(tensor);
    }

    double mb = 1024.0 * 1024.0;
    printf("%s: model size: %8.2f MB, memory used: %8.2f MB, memory reserved: %8.2f MB\n",
        __func__,
        model_size / mb,
        ggml_used_mem(model.tensors_ctx) / mb,
        ggml_get_mem_size(model.tensors_ctx) / mb
    );
    return ggml_get_mem_size(model.tensors_ctx);
}
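
// The on-disk format stores all integers and floats little-endian, so refuse
// to run on big-endian (or unknown) hosts rather than silently misread weights.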
void assert_endianness() {
    union {
        unsigned int i;
        char c[4];
    } un;
    un.i = 0x12345678;

    if (un.c[0] == 0x78 && un.c[3] == 0x12) {
        printf("little-endian\n");
    } else if (un.c[0] == 0x12 && un.c[3] == 0x78) {
        printf("big-endian\n");
        GGML_ASSERT(false); // model_loader.cpp assumes the system is little-endian
    } else {
        printf("unknown-endian\n");
        GGML_ASSERT(false); // model_loader.cpp assumes the system is little-endian
    }
}
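
// Reads a hyperparameter section: an int64 count followed by that many
// (length-prefixed name, int64 value) pairs. An empty name ends the section
// early.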
void model_loader::load_hparams(std::unordered_map<std::string, std::int64_t>& hparams, std::ifstream &fin)
{
    std::int64_t num_params = 0;
    fin.read(reinterpret_cast<char*>(&num_params), sizeof num_params);
    GGML_ASSERT(fin.gcount() == 8);
    hparams.reserve(num_params);

    std::int64_t value;
    for (int i = 0; i < num_params; ++i) {
        std::string name = get_name(fin);
        if (name.length() == 0)
            break;
        fin.read((char*) &value, sizeof(value));
        hparams[name] = value;
    }
}
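
// Reads the vocabulary: an int64 size, all words packed into a single
// length-prefixed string, then two tensors holding the per-word lengths (int8)
// and scores (float). Words are unpacked into vocab.token_to_id and
// vocab.id_to_token; consecutive words are separated by one extra byte.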
void model_loader::load_vocab(llama_vocab& vocab, std::ifstream &fin)
{
    // vocab.special_bos_id = 1;
    // vocab.special_eos_id = 2;
    // vocab.special_unk_id = 0;
    // vocab.special_sep_id = -1;
    // vocab.special_pad_id = -1;

    std::int64_t vocab_size = 0;
    fin.read(reinterpret_cast<char*>(&vocab_size), sizeof(vocab_size));
    GGML_ASSERT(fin.gcount() == 8);

    vocab.token_to_id.reserve(vocab_size);
    vocab.id_to_token.reserve(vocab_size);

    std::string packed_vocab = get_name(fin);

    // Rough upper bound for the two tensors read below, doubled for headroom.
    std::int64_t ctx_size = vocab_size * sizeof(float) + vocab_size + 2 * ggml_tensor_overhead();
    ctx_size *= 2;
    ggml_context* ctx = ggml_init(ggml_init_params{ctx_size, nullptr, false});
    ggml_tensor* lengths_tensor = load_tensor_value(fin, ctx, true);
    std::int8_t* lengths = (std::int8_t*)lengths_tensor->data;
    ggml_tensor* scores_tensor = load_tensor_value(fin, ctx, true);
    float* scores = ggml_get_data_f32(scores_tensor);

    int64_t offset = 0;
    for (int i = 0; i < vocab_size; ++i) {
        // TODO: we should use std::string_view instead of copying each word into a new string
        std::string word = packed_vocab.substr(offset, lengths[i]);
        vocab.token_to_id[word] = i;
        vocab.id_to_token.push_back({word, scores[i], LLAMA_TOKEN_TYPE_NORMAL});
        offset += lengths[i] + 1;
    }

    // Since we copied lengths and scores, we don't need the context anymore.
    ggml_free(ctx);

    // vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
    // TODO: special tokens stuff?
}
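
// Reads one tensor record: int32 rank, int32 ggml type, `rank` int64
// dimensions, then the raw data. Returns nullptr on a malformed header, in
// which case the stream position is no longer trustworthy. With as_float32
// set, f16 data is upcast to f32 while reading.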
ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx, bool as_float32)
{
    int32_t n_dims = 0;
    int32_t raw_type = 0;
    fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
    fin.read(reinterpret_cast<char *>(&raw_type), sizeof(raw_type));
    ggml_type type = ggml_type(raw_type);

    if (n_dims <= 0 || n_dims > GGML_MAX_DIMS || raw_type < 0 || raw_type >= GGML_TYPE_COUNT) {
        return nullptr;
    }

    int64_t ne[4] = {1, 1, 1, 1};
    for (int i = 0; i < n_dims; ++i) {
        fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
    }

    ggml_tensor* tensor;
    if (as_float32 && type == GGML_TYPE_F16) {
        // Read half-precision weights from disk and convert them to f32 on the
        // fly, one 128-element block at a time.
        tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, n_dims, ne);
        ggml_fp16_t buf[128];
        int num_el = ggml_nelements(tensor);
        for (int i = 0; i < num_el; i += 128) {
            int block_size = std::min(128, num_el - i);
            fin.read(reinterpret_cast<char *>(buf), ggml_type_size(type) * block_size);
            ggml_fp16_to_fp32_row(buf, (float*)tensor->data + i, block_size);
        }
    } else {
        tensor = ggml_new_tensor(ctx, type, n_dims, ne);
        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
    }
    return tensor;
}
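
// Reads a length-prefixed (uint32) string; an empty string marks the end of a
// section.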
std::string
model_loader::get_name(std::ifstream& fin)
{
    std::uint32_t length = 0;
    fin.read(reinterpret_cast<char *>(&length), sizeof(length));
    if (length == 0)
        return "";

    std::string name(length, 0);
    fin.read(&name[0], length);
    return name;
}
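
// C entry point: checks host endianness, validates the magic number, then
// reads the sections in on-disk order: hparams, layer config, vocab, weights.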
  185. extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
  186. model_loader loader;
  187. assert_endianness();
  188. auto fin = open_ggml_file(fname);
  189. loader.load_hparams(model.hparams, fin);
  190. loader.load_hparams(model.layer_config, fin);
  191. loader.load_vocab(model.vocab, fin);
  192. loader.load_model_weights(model, fin);
  193. return 0;
  194. }
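
// Example usage (a minimal sketch; the file path and tensor name below are
// hypothetical, and fairseq2_model is assumed default-constructible):
//
//     fairseq2_model model;
//     load_fairseq2_ggml_file(model, "model.ggml");  // throws on a bad file
//     ggml_tensor* embed = model.tensors["text_encoder.embed_tokens.weight"];
//     ...
//     ggml_free(model.tensors_ctx);  // releases all loaded weights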