test-blas0.c

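//
// Sanity test: multiply the same two random matrices with a naive triple loop,
// with cblas_sgemm from the Accelerate framework, and with ggml_mul_mat
// (f32 and f16 weights), then compare the results element-wise.
//
// Build sketch (flags and paths are illustrative only, assuming a ggml source
// checkout on macOS where Accelerate provides CBLAS):
//
//   cc -O3 -std=c11 -I. test-blas0.c ggml.c -framework Accelerate -o test-blas0
//
// Run as ./test-blas0 M N K, e.g. ./test-blas0 128 256 512; passing 0 for a
// dimension picks a random size in [1, 1000].
//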
#include "ggml.h"

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>

#include <sys/time.h>

#include <arm_neon.h>

#include <Accelerate/Accelerate.h>
// wall-clock time in microseconds
uint64_t get_time_us(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000000 + tv.tv_usec;
}
//
// naive implementation
//

void mul_mat_f32_0(
    const float * restrict src0, // M x K
    const float * restrict src1, // N x K (transposed)
    float * dst,                 // N x M (dst[j*m + i])
    int m, int n, int k) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            float sum = 0;
            for (int l = 0; l < k; l++) {
                sum += src0[i*k + l] * src1[j*k + l];
            }
            dst[j*m + i] = sum;
        }
    }
}
int main(int argc, const char ** argv) {
    if (argc < 4) {
        printf("Usage: %s M N K\n", argv[0]);
        return 1;
    }

    const int n_threads = 1;

    int M = atoi(argv[1]);
    int N = atoi(argv[2]);
    int K = atoi(argv[3]);

    srand(time(NULL));

    // a dimension of 0 means: pick a random size in [1, 1000]
    if (M == 0) M = rand() % 1000 + 1;
    if (N == 0) N = rand() % 1000 + 1;
    if (K == 0) K = rand() % 1000 + 1;

    printf("M = %d, N = %d, K = %d\n", M, N, K);

    float * src0 = malloc(sizeof(float)*M*K);
    float * src1 = malloc(sizeof(float)*N*K);
    float * dst0 = malloc(sizeof(float)*M*N); // naive
    float * dst1 = malloc(sizeof(float)*M*N); // blas
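
    // ggml_init reserves one contiguous memory pool up front (2 GiB here, with
    // no_alloc = false); the tensors created below and the graph evaluation
    // scratch are expected to be carved out of this buffer rather than
    // allocated separately.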
    struct ggml_init_params params = {
        .mem_size   = 2048ul*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };

    struct ggml_context * ctx0 = ggml_init(params);

    struct ggml_tensor * s0_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, M); // M rows of K floats
    struct ggml_tensor * s1_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, N); // N rows of K floats
    struct ggml_tensor * s0_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, M);
    struct ggml_tensor * s1_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, N);
    for (int j = 0; j < M; j++) {
        for (int i = 0; i < K; i++) {
            //src0[j*K + i] = j;
            src0[j*K + i] = 1e-3*(rand() % 1000);
        }
    }

    for (int j = 0; j < N; j++) {
        for (int i = 0; i < K; i++) {
            //src1[j*K + i] = j + 1;
            src1[j*K + i] = 1e-3*(rand() % 1000);
        }
    }
    // copy src0 to s0_f32 and s0_f16
    {
        float       * p_f32 = s0_f32->data;
        ggml_fp16_t * p_f16 = s0_f16->data;
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < K; j++) {
                p_f32[i*K + j] = src0[i*K + j];
                p_f16[i*K + j] = ggml_fp32_to_fp16(src0[i*K + j]);
            }
        }
    }

    // copy src1 to s1_f32 and s1_f16
    {
        float       * p_f32 = s1_f32->data;
        ggml_fp16_t * p_f16 = s1_f16->data;
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < K; j++) {
                p_f32[i*K + j] = src1[i*K + j];
                p_f16[i*K + j] = ggml_fp32_to_fp16(src1[i*K + j]);
            }
        }
    }
    const clock_t  start    = clock();
    const uint64_t start_us = get_time_us();

    mul_mat_f32_0(src0, src1, dst0, M, N, K);

    // same product via BLAS sgemm from the Accelerate framework
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, N, M, K, 1.0f, src1, K, src0, K, 0.0f, dst1, M);
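
    // Parameter mapping for the sgemm call above: with CblasRowMajor,
    // A = src1 (N x K, not transposed) and B = src0 (M x K, transposed), so
    // C = src1 * src0^T is N x M with leading dimension M. Element (j, i) of C
    // is the dot product of row j of src1 with row i of src0 -- the same value
    // the naive loop stores at dst0[j*M + i], which is what the element-wise
    // checks below rely on.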
    struct ggml_tensor * dst2 = NULL;
    struct ggml_tensor * dst3 = NULL;
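
    // ggml_mul_mat(a, b) takes both operands stored as rows of length K
    // (ne[0] = K) and yields a tensor with ne = [M, N], i.e. the same
    // dst[j*M + i] memory layout as dst0 above (the checks below depend on
    // this). dst2 multiplies the f32 copies of the inputs; dst3 swaps in the
    // f16 copy of src0 to exercise the mixed-precision path.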
    // ggml: f32 x f32
    {
        dst2 = ggml_mul_mat(ctx0, s0_f32, s1_f32);

        struct ggml_cgraph gf = ggml_build_forward(dst2);
        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
    }

    // ggml: f16 x f32
    {
        dst3 = ggml_mul_mat(ctx0, s0_f16, s1_f32);

        struct ggml_cgraph gf = ggml_build_forward(dst3);
        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
    }
    bool ok_blas     = true;
    bool ok_ggml_f32 = true;
    bool ok_ggml_f16 = true;

    // check BLAS against the naive result (relative error tolerance 1e-4)
    for (int i = 0; i < M*N; i++) {
        if (fabs(dst0[i] - dst1[i])/fabs(dst0[i]) > 0.0001) {
            printf("dst0[%d] = %f, dst1[%d] = %f\n", i, dst0[i], i, dst1[i]);
            ok_blas = false;
        }
    }
    // check ggml (f32), same 1e-4 tolerance
    {
        float * p = dst2->data;
        for (int i = 0; i < M*N; i++) {
            if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.0001) {
                printf("dst0[%d] = %f, dst2[%d] = %f\n", i, dst0[i], i, p[i]);
                ok_ggml_f32 = false;
            }
        }
    }

    // check ggml (f16) - looser 1e-2 tolerance, since src0 was rounded to half precision
    {
        float * p = dst3->data;
        for (int i = 0; i < M*N; i++) {
            if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.01) {
                printf("dst0[%d] = %f, dst3[%d] = %f\n", i, dst0[i], i, p[i]);
                ok_ggml_f16 = false;
            }
        }
    }
    {
        const clock_t  end    = clock();
        const uint64_t end_us = get_time_us();

        printf("%s: elapsed ticks: %ld\n", __func__, (long)(end - start));
        printf("%s: elapsed us:    %llu\n", __func__, (unsigned long long)(end_us - start_us));
    }
#if 0
    // print src0
    printf("src0:\n");
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < K; j++) {
            printf("%4.1f ", src0[i*K+j]);
        }
        printf("\n");
    }

    // print src1
    printf("src1:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < K; j++) {
            printf("%4.1f ", src1[i*K+j]);
        }
        printf("\n");
    }
    printf("\n");

    printf("dst0 (naive):\n");
    for (int j = 0; j < N; j++) {
        for (int i = 0; i < M; i++) {
            printf("%4.1f ", dst0[j*M+i]);
        }
        printf("\n");
    }
    printf("\n");

    printf("dst1 (BLAS):\n");
    for (int j = 0; j < N; j++) {
        for (int i = 0; i < M; i++) {
            printf("%4.1f ", dst1[j*M+i]);
        }
        printf("\n");
    }
    printf("\n");

    printf("dst2 (ggml f32):\n");
    for (int j = 0; j < N; j++) {
        for (int i = 0; i < M; i++) {
            printf("%4.1f ", ((float *)dst2->data)[j*M+i]);
        }
        printf("\n");
    }
    printf("\n");

    printf("dst3 (ggml f16):\n");
    for (int j = 0; j < N; j++) {
        for (int i = 0; i < M; i++) {
            printf("%4.1f ", ((float *)dst3->data)[j*M+i]);
        }
        printf("\n");
    }
    printf("\n");
#endif
    free(src0);
    free(src1);
    free(dst0);
    free(dst1);

    ggml_free(ctx0);

    printf("ok_blas = %d\n", ok_blas);
    if (!ok_blas) {
        printf("ERROR: BLAS failed\n");
    }
    printf("ok_ggml_f32 = %d\n", ok_ggml_f32);
    if (!ok_ggml_f32) {
        printf("ERROR: ggml (f32) failed\n");
    }
    printf("ok_ggml_f16 = %d\n", ok_ggml_f16);
    if (!ok_ggml_f16) {
        printf("ERROR: ggml (f16) failed\n");
    }

    return (ok_blas && ok_ggml_f32 && ok_ggml_f16) ? 0 : 1;
}