// common-ggml.cpp (8.5 KB)
#include "common-ggml.h"

#include <cerrno>
#include <climits>
#include <cstdint>
#include <cstdlib>
#include <map>
#include <regex>
  4. static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
  5. {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
  6. {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
  7. {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
  8. {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
  9. {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
  10. };
  11. void ggml_print_ftypes(FILE * fp) {
  12. for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
  13. fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
  14. }
  15. }
  16. enum ggml_ftype ggml_parse_ftype(const char * str) {
  17. enum ggml_ftype ftype;
  18. if (str[0] == 'q') {
  19. const auto it = GGML_FTYPE_MAP.find(str);
  20. if (it == GGML_FTYPE_MAP.end()) {
  21. fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
  22. return GGML_FTYPE_UNKNOWN;
  23. }
  24. ftype = it->second;
  25. } else {
  26. ftype = (enum ggml_ftype) atoi(str);
  27. }
  28. return ftype;
  29. }
  30. bool ggml_common_quantize_0(
  31. std::ifstream & finp,
  32. std::ofstream & fout,
  33. const ggml_ftype ftype,
  34. const std::vector<std::string> & to_quant,
  35. const std::vector<std::string> & to_skip) {
  36. ggml_type qtype = GGML_TYPE_F32;
  37. switch (ftype) {
  38. case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
  39. case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
  40. case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
  41. case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
  42. case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
  43. case GGML_FTYPE_UNKNOWN:
  44. case GGML_FTYPE_ALL_F32:
  45. case GGML_FTYPE_MOSTLY_F16:
  46. case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
  47. case GGML_FTYPE_MOSTLY_Q2_K:
  48. case GGML_FTYPE_MOSTLY_Q3_K:
  49. case GGML_FTYPE_MOSTLY_Q4_K:
  50. case GGML_FTYPE_MOSTLY_Q5_K:
  51. case GGML_FTYPE_MOSTLY_Q6_K:
  52. {
  53. fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
  54. return false;
  55. }
  56. };
  57. if (!ggml_is_quantized(qtype)) {
  58. fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
  59. return false;
  60. }
  61. size_t total_size_org = 0;
  62. size_t total_size_new = 0;
  63. std::vector<float> work;
  64. std::vector<uint8_t> data_u8;
  65. std::vector<ggml_fp16_t> data_f16;
  66. std::vector<float> data_f32;
  67. std::vector<int64_t> hist_all(1 << 4, 0);
  68. while (true) {
  69. int32_t n_dims;
  70. int32_t length;
  71. int32_t ttype;
  72. finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
  73. finp.read(reinterpret_cast<char *>(&length), sizeof(length));
  74. finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
  75. if (finp.eof()) {
  76. break;
  77. }
  78. int32_t nelements = 1;
  79. int32_t ne[4] = { 1, 1, 1, 1 };
  80. for (int i = 0; i < n_dims; ++i) {
  81. finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
  82. nelements *= ne[i];
  83. }
  84. std::string name(length, 0);
  85. finp.read (&name[0], length);
  86. printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
  87. bool quantize = false;
  88. // check if we should quantize this tensor
  89. for (const auto & s : to_quant) {
  90. if (std::regex_match(name, std::regex(s))) {
  91. quantize = true;
  92. break;
  93. }
  94. }
  95. // check if we should skip this tensor
  96. for (const auto & s : to_skip) {
  97. if (std::regex_match(name, std::regex(s))) {
  98. quantize = false;
  99. break;
  100. }
  101. }
  102. // quantize only 2D tensors
  103. quantize &= (n_dims == 2);
  104. if (quantize) {
  105. if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
  106. fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
  107. return false;
  108. }
  109. if (ttype == GGML_TYPE_F16) {
  110. data_f16.resize(nelements);
  111. finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
  112. data_f32.resize(nelements);
  113. for (int i = 0; i < nelements; ++i) {
  114. data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
  115. }
  116. } else {
  117. data_f32.resize(nelements);
  118. finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
  119. }
  120. ttype = qtype;
  121. } else {
  122. const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
  123. data_u8.resize(nelements*bpe);
  124. finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
  125. }
  126. fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
  127. fout.write(reinterpret_cast<char *>(&length), sizeof(length));
  128. fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
  129. for (int i = 0; i < n_dims; ++i) {
  130. fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
  131. }
  132. fout.write(&name[0], length);
  133. if (quantize) {
  134. work.resize(nelements); // for quantization
  135. size_t cur_size = 0;
  136. std::vector<int64_t> hist_cur(1 << 4, 0);
  137. switch ((ggml_type) ttype) {
  138. case GGML_TYPE_Q4_0:
  139. {
  140. cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
  141. } break;
  142. case GGML_TYPE_Q4_1:
  143. {
  144. cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
  145. } break;
  146. case GGML_TYPE_Q5_0:
  147. {
  148. cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
  149. } break;
  150. case GGML_TYPE_Q5_1:
  151. {
  152. cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
  153. } break;
  154. case GGML_TYPE_Q8_0:
  155. {
  156. cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
  157. } break;
  158. case GGML_TYPE_F32:
  159. case GGML_TYPE_F16:
  160. case GGML_TYPE_I8:
  161. case GGML_TYPE_I16:
  162. case GGML_TYPE_I32:
  163. case GGML_TYPE_Q8_1:
  164. case GGML_TYPE_Q2_K:
  165. case GGML_TYPE_Q3_K:
  166. case GGML_TYPE_Q4_K:
  167. case GGML_TYPE_Q5_K:
  168. case GGML_TYPE_Q6_K:
  169. case GGML_TYPE_Q8_K:
  170. case GGML_TYPE_COUNT:
  171. {
  172. fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
  173. return false;
  174. }
  175. }
  176. fout.write(reinterpret_cast<char *>(work.data()), cur_size);
  177. total_size_new += cur_size;
  178. printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
  179. for (int i = 0; i < (int) hist_cur.size(); ++i) {
  180. hist_all[i] += hist_cur[i];
  181. }
  182. for (int i = 0; i < (int) hist_cur.size(); ++i) {
  183. printf("%5.3f ", hist_cur[i] / (float)nelements);
  184. }
  185. printf("\n");
  186. } else {
  187. printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
  188. fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
  189. total_size_new += data_u8.size();
  190. }
  191. total_size_org += nelements * sizeof(float);
  192. }
  193. printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
  194. printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
  195. {
  196. int64_t sum_all = 0;
  197. for (int i = 0; i < (int) hist_all.size(); ++i) {
  198. sum_all += hist_all[i];
  199. }
  200. printf("%s: hist: ", __func__);
  201. for (int i = 0; i < (int) hist_all.size(); ++i) {
  202. printf("%5.3f ", hist_all[i] / (float)sum_all);
  203. }
  204. printf("\n");
  205. }
  206. return true;
  207. }