123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- #include <stdio.h>
- #include <assert.h>
- #include <stdlib.h>
- #include <time.h>
- const int N = 1 << 14;
- const int M = 1 << 14;
- void mul_mat_vec_f32_0(
- const float * src0,
- const float * src1,
- float * dst,
- unsigned nrows,
- unsigned ncols) {
- for (unsigned i = 0; i < nrows; i++) {
- float sum = 0.0f;
- for (unsigned j = 0; j < ncols; j++) {
- sum += src0[i*ncols + j]*src1[j];
- }
- dst[i] = sum;
- }
- }
- #if defined(_MSC_VER)
- typedef float __declspec(align(32)) afloat;
- #else
- typedef float afloat __attribute__((__aligned__(32)));
- #endif
- void mul_mat_vec_f32_1(
- const afloat *restrict src0,
- const afloat *restrict src1,
- afloat *restrict dst,
- unsigned nrows,
- unsigned ncols) {
- for (unsigned i = 0; i < nrows; i++) {
- const afloat * restrict row = src0 + i*ncols;
- const afloat * restrict col = src1;
- float sum = 0.0f;
- for (unsigned j = 0; j < ncols; j++) {
- sum += *row++ * *col++;
- }
- dst[i] = sum;
- //float sum[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
- //for (unsigned j = 0; j < ncols; j += 8) {
- // sum[0] += row[0]*col[0];
- // sum[1] += row[1]*col[1];
- // sum[2] += row[2]*col[2];
- // sum[3] += row[3]*col[3];
- // sum[4] += row[4]*col[4];
- // sum[5] += row[5]*col[5];
- // sum[6] += row[6]*col[6];
- // sum[7] += row[7]*col[7];
- // row += 8;
- // col += 8;
- //}
- //dst[i] = sum[0] + sum[1] + sum[2] + sum[3] + sum[4] + sum[5] + sum[6] + sum[7];
- }
- }
- void mul_mat_vec_f32_2(
- const void * src0,
- const void * src1,
- void * dst,
- unsigned nrows,
- unsigned ncols) {
- void * d = dst;
- for (unsigned i = 0; i < nrows; i++) {
- float sum = 0.0f;
- const char * row = (const char*)src0 + i*ncols*sizeof(float);
- const char * col = (const char*)src1;
- for (unsigned j = 0; j < ncols; j++) {
- sum += (*(float *)row) * (*(float *)col);
- row += sizeof(float);
- col += sizeof(float);
- }
- *(float *)d = sum;
- d = (char*)d + sizeof(float);
- }
- }
- #if defined(_MSC_VER)
- void* aligned_alloc(size_t alignment, size_t size) {
- return _aligned_malloc(size, alignment);
- }
- #endif
- int main(int argc, const char ** argv) {
- //float * src0 = malloc(sizeof(float)*N*M);
- //float * src1 = malloc(sizeof(float)*M);
- //float * dst = malloc(sizeof(float)*N);
- afloat * src0 = (float *)(aligned_alloc(32, sizeof(float)*N*M));
- afloat * src1 = (float *)(aligned_alloc(32, sizeof(float)*M));
- afloat * dst = (float *)(aligned_alloc(32, sizeof(float)*N));
- for (int i = 0; i < N*M; i++) {
- src0[i] = (afloat)i;
- }
- for (int i = 0; i < M; i++) {
- src1[i] = (afloat)i;
- }
- const int nIter = 10;
- const clock_t start = clock();
- double sum = 0.0f;
- for (int i = 0; i < nIter; i++) {
- //mul_mat_vec_f32_0(src0, src1, dst, N, M);
- mul_mat_vec_f32_1(src0, src1, dst, N, M);
- //mul_mat_vec_f32_2(src0, src1, dst, N, M);
- for (int i = 0; i < N; i++) {
- sum += dst[i];
- }
- }
- {
- const clock_t end = clock();
- printf("%s: elapsed ticks: %ld\n", __func__, end - start);
- }
- printf("%f\n", sum);
- return 0;
- }
|