test-vec2.c 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. #include <stdint.h>
  2. #include <stdio.h>
  3. #include <assert.h>
  4. #include <stdlib.h>
  5. #include <time.h>
  6. #include <math.h>
  7. #include <sys/time.h>
  8. #include <arm_neon.h>
  9. const int N = 1 << 12;
  10. const int M = 1 << 12;
  11. //
  12. // naive implementation
  13. //
  14. void mul_mat_vec_f32_0(
  15. const float * restrict src0,
  16. const float * restrict src1,
  17. float * dst,
  18. int nrows,
  19. int ncols) {
  20. for (int i = 0; i < nrows; i++) {
  21. float sum = 0.0f;
  22. for (int j = 0; j < ncols; j++) {
  23. sum += src0[i*ncols + j]*src1[j];
  24. }
  25. dst[i] = sum;
  26. }
  27. }
  28. void mul_mat_vec_f16_0(
  29. const __fp16 * src0,
  30. const __fp16 * src1,
  31. float * dst,
  32. int nrows,
  33. int ncols) {
  34. const int n64 = ncols & ~63;
  35. for (int r = 0; r < nrows; r++) {
  36. float sumf = 0.0;
  37. float16x8_t sum0 = vdupq_n_f16(0.0f);
  38. float16x8_t sum1 = vdupq_n_f16(0.0f);
  39. float16x8_t sum2 = vdupq_n_f16(0.0f);
  40. float16x8_t sum3 = vdupq_n_f16(0.0f);
  41. float16x8_t sum4 = vdupq_n_f16(0.0f);
  42. float16x8_t sum5 = vdupq_n_f16(0.0f);
  43. float16x8_t sum6 = vdupq_n_f16(0.0f);
  44. float16x8_t sum7 = vdupq_n_f16(0.0f);
  45. float16x8_t x0, x1, x2, x3, x4, x5, x6, x7;
  46. float16x8_t y0, y1, y2, y3, y4, y5, y6, y7;
  47. const __fp16 * restrict p0 = src0 + r*ncols;
  48. for (int i = 0; i < n64; i += 64) {
  49. x0 = vld1q_f16(p0 + i + 0 );
  50. x1 = vld1q_f16(p0 + i + 8 );
  51. x2 = vld1q_f16(p0 + i + 16);
  52. x3 = vld1q_f16(p0 + i + 24);
  53. x4 = vld1q_f16(p0 + i + 32);
  54. x5 = vld1q_f16(p0 + i + 40);
  55. x6 = vld1q_f16(p0 + i + 48);
  56. x7 = vld1q_f16(p0 + i + 56);
  57. y0 = vld1q_f16(src1 + i + 0 );
  58. y1 = vld1q_f16(src1 + i + 8 );
  59. y2 = vld1q_f16(src1 + i + 16);
  60. y3 = vld1q_f16(src1 + i + 24);
  61. y4 = vld1q_f16(src1 + i + 32);
  62. y5 = vld1q_f16(src1 + i + 40);
  63. y6 = vld1q_f16(src1 + i + 48);
  64. y7 = vld1q_f16(src1 + i + 56);
  65. sum0 = vfmaq_f16(sum0, x0, y0);
  66. sum1 = vfmaq_f16(sum1, x1, y1);
  67. sum2 = vfmaq_f16(sum2, x2, y2);
  68. sum3 = vfmaq_f16(sum3, x3, y3);
  69. sum4 = vfmaq_f16(sum4, x4, y4);
  70. sum5 = vfmaq_f16(sum5, x5, y5);
  71. sum6 = vfmaq_f16(sum6, x6, y6);
  72. sum7 = vfmaq_f16(sum7, x7, y7);
  73. }
  74. // TODO: F16 - better way to reduce this ?
  75. float16x8_t sum = vaddq_f16(sum0, sum1);
  76. sum = vaddq_f16(sum, sum2);
  77. sum = vaddq_f16(sum, sum3);
  78. sum = vaddq_f16(sum, sum4);
  79. sum = vaddq_f16(sum, sum5);
  80. sum = vaddq_f16(sum, sum6);
  81. sum = vaddq_f16(sum, sum7);
  82. sumf += sum[0] + sum[1] + sum[2] + sum[3] + sum[4] + sum[5] + sum[6] + sum[7];
  83. for (int j = n64; j < n64; j++) {
  84. sumf += src0[r*ncols + j]*src1[j];
  85. }
  86. dst[r] = sumf;
  87. }
  88. }
  89. void mul_mat_vec_f16_1(
  90. const __fp16 * src0,
  91. const __fp16 * src1,
  92. float * dst,
  93. int nrows,
  94. int ncols) {
  95. const int n32 = ncols & ~31;
  96. for (int r = 0; r < nrows; r++) {
  97. float sumf = 0.0;
  98. float16x8_t sum0 = vdupq_n_f16(0.0f);
  99. float16x8_t sum1 = vdupq_n_f16(0.0f);
  100. float16x8_t sum2 = vdupq_n_f16(0.0f);
  101. float16x8_t sum3 = vdupq_n_f16(0.0f);
  102. float16x8_t x0, x1, x2, x3;
  103. float16x8_t y0, y1, y2, y3;
  104. const __fp16 * restrict p0 = src0 + r*ncols;
  105. for (int i = 0; i < n32; i += 32) {
  106. x0 = vld1q_f16(p0 + i + 0 );
  107. x1 = vld1q_f16(p0 + i + 8 );
  108. x2 = vld1q_f16(p0 + i + 16);
  109. x3 = vld1q_f16(p0 + i + 24);
  110. y0 = vld1q_f16(src1 + i + 0 );
  111. y1 = vld1q_f16(src1 + i + 8 );
  112. y2 = vld1q_f16(src1 + i + 16);
  113. y3 = vld1q_f16(src1 + i + 24);
  114. sum0 = vfmaq_f16(sum0, x0, y0);
  115. sum1 = vfmaq_f16(sum1, x1, y1);
  116. sum2 = vfmaq_f16(sum2, x2, y2);
  117. sum3 = vfmaq_f16(sum3, x3, y3);
  118. }
  119. // reduce sum0..sum3 to sum0
  120. sum0 = vaddq_f16(sum0, sum1);
  121. sum2 = vaddq_f16(sum2, sum3);
  122. sum0 = vaddq_f16(sum0, sum2);
  123. // load sum0 into 2 float32x4_t
  124. float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
  125. float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
  126. // reduce sum0f32 and sum1f32 to sumf
  127. sum0f32 = vaddq_f32(sum0f32, sum1f32);
  128. float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
  129. sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
  130. //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7];
  131. for (int j = n32; j < n32; j++) {
  132. sumf += src0[r*ncols + j]*src1[j];
  133. }
  134. dst[r] = sumf;
  135. }
  136. }
  137. uint64_t get_time_us() {
  138. struct timeval tv;
  139. gettimeofday(&tv, NULL);
  140. return tv.tv_sec * 1000000 + tv.tv_usec;
  141. }
  142. int main(int argc, const char ** argv) {
  143. float * src0 = malloc(sizeof(float)*N*M);
  144. float * src1 = malloc(sizeof(float)*M);
  145. float * dst = malloc(sizeof(float)*N);
  146. //float * src0 = (float *)(aligned_alloc(64, sizeof(float)*N*M));
  147. //float * src1 = (float *)(aligned_alloc(64, sizeof(float)*M));
  148. //float * dst = (float *)(aligned_alloc(64, sizeof(float)*N));
  149. for (int i = 0; i < N*M; i++) {
  150. src0[i] = rand() / (float)RAND_MAX;
  151. }
  152. for (int i = 0; i < M; i++) {
  153. src1[i] = rand() / (float)RAND_MAX;
  154. }
  155. // convert src0 and src1 to __fp16
  156. __fp16 * src0_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*N*M));
  157. __fp16 * src1_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*M));
  158. {
  159. const uint64_t t_start = get_time_us();
  160. for (int i = 0; i < N*M; i++) {
  161. src0_fp16[i] = src0[i];
  162. //printf("%f %f\n", src0[i], src0_fp16[i]);
  163. //assert(!isnan(src0_fp16[i]));
  164. }
  165. for (int i = 0; i < M; i++) {
  166. src1_fp16[i] = src1[i];
  167. }
  168. const uint64_t t_end = get_time_us();
  169. printf("convert time: %f ms\n", (t_end - t_start) / 1000.0);
  170. }
  171. for (int i = 0; i < 16; ++i) {
  172. printf("%f %f\n", src0[i], src0_fp16[i]);
  173. }
  174. int method = 0;
  175. if (argc > 1) {
  176. method = atoi(argv[1]);
  177. }
  178. const int nIter = 1000;
  179. const clock_t start = clock();
  180. const uint64_t start_us = get_time_us();
  181. double iM = 1.0/M;
  182. double sum = 0.0f;
  183. for (int i = 0; i < nIter; i++) {
  184. if (method == 0) {
  185. mul_mat_vec_f32_0(src0, src1, dst, N, M);
  186. }
  187. if (method == 1) {
  188. mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, N, M);
  189. }
  190. if (method == 2) {
  191. mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, N, M);
  192. }
  193. }
  194. for (int i = 0; i < N; i++) {
  195. sum += dst[i]*iM;
  196. }
  197. {
  198. const clock_t end = clock();
  199. const uint64_t end_us = get_time_us();
  200. printf("%s: elapsed ticks: %ld\n", __func__, end - start);
  201. printf("%s: elapsed us: %llu / %f ms\n", __func__, end_us - start_us, (end_us - start_us) / 1000.0 / nIter);
  202. }
  203. printf("%f\n", sum);
  204. free(src0);
  205. free(src1);
  206. free(dst);
  207. free(src0_fp16);
  208. free(src1_fp16);
  209. return 0;
  210. }