矩阵乘加速实验

该实验为复现GEMM通用矩阵乘法

在这场实验中我将尝试各种优化算法,并记录下其运行性能对比。

这是作业,其中会有很多胡的地方,深究就完了,唉唉(bushi

实验过程

一、基础版本

算法代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>

// 获取当前时间(毫秒)
double getCurrentTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}

// 生成50-200范围内的随机矩阵
float* generateLargeMatrix(int n) {
float* matrix = (float*)malloc(n * n * sizeof(float));
if (matrix == NULL) {
printf("Memory allocation failed for %dx%d matrix!\n", n, n);
exit(1);
}

// 50-200范围内的随机浮点数
for (int i = 0; i < n * n; i++) {
matrix[i] = 50.0f + (float)(rand() % 15001) / 100.0f; // 50.00-200.00
}
return matrix;
}

// 优化后的CPU矩阵乘法(循环交换改善缓存局部性)
void optimized_matmul(const float* a, const float* b, float* c, int n) {
// 先将结果矩阵清零
for (int i = 0; i < n * n; i++) {
c[i] = 0.0f;
}

// 交换循环顺序: i-k-j 改善缓存局部性
for (int i = 0; i < n; i++) {
for (int k = 0; k < n; k++) {
float temp = a[i * n + k];
for (int j = 0; j < n; j++) {
c[i * n + j] += temp * b[k * n + j];
}
}
}
}

int main() {
const int n = 4096; // 4096x4096矩阵
printf("Testing 4096x4096 matrix multiplication (elements in 50-200 range)...\n");

// 初始化随机数种子
srand((unsigned int)time(NULL));

// 生成矩阵
printf("Generating matrices...\n");
double start = getCurrentTime();
float* A = generateLargeMatrix(n);
float* B = generateLargeMatrix(n);
double gen_time = getCurrentTime() - start;
printf("Matrix generation completed in %.2f ms\n", gen_time);

// 分配结果矩阵
float* C = (float*)malloc(n * n * sizeof(float));
if (C == NULL) {
printf("Memory allocation failed for result matrix!\n");
free(A);
free(B);
return 1;
}

// 执行矩阵乘法
printf("Performing matrix multiplication...\n");
start = getCurrentTime();
optimized_matmul(A, B, C, n);
double mult_time = getCurrentTime() - start;

// 计算FLOPs (浮点运算次数)
double flops = 2.0 * n * n * n; // 2n^3 FLOPs
double gflops = flops / (mult_time / 1000.0) / 1e9;

printf("\nPerformance results:\n");
printf(" Matrix size: %dx%d\n", n, n);
printf(" Matrix generation: %.2f ms\n", gen_time);
printf(" Multiplication time: %.2f ms (%.2f seconds)\n", mult_time, mult_time/1000.0);
printf(" Computational rate: %.2f GFLOP/s\n", gflops);
printf(" Memory usage: %.2f MB per matrix\n",
(n * n * sizeof(float)) / (1024.0 * 1024.0));

// 验证结果(抽样检查)
printf("\nVerifying results (sampling)...\n");
int sample_pos[3][2] = {{0,0}, {n/2,n/2}, {n-1,n-1}};
for (int i = 0; i < 3; i++) {
int row = sample_pos[i][0];
int col = sample_pos[i][1];

double sum = 0.0;
for (int k = 0; k < n; k++) {
sum += (double)A[row * n + k] * B[k * n + col];
}

printf(" C[%d][%d]: computed=%.2f, expected≈%.2f (error=%.2f%%)\n",
row, col, C[row * n + col], sum,
100.0*fabs(C[row * n + col] - sum)/fabs(sum));
}

// 释放内存
free(A);
free(B);
free(C);

return 0;
}

运行结果为

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
Testing 4096x4096 matrix multiplication (elements in 50-200 range)...
Generating matrices...
Matrix generation completed in 267.50 ms
Performing matrix multiplication...

Performance results:
Matrix size: 4096x4096
Matrix generation: 267.50 ms
Multiplication time: 127565.08 ms (127.57 seconds)
Computational rate: 1.08 GFLOP/s
Memory usage: 64.00 MB per matrix

Verifying results (sampling)...
C[0][0]: computed=58970436.00, expected≈58970403.72 (error=0.00%)
C[2048][2048]: computed=59248904.00, expected≈59248936.34 (error=0.00%)
C[4095][4095]: computed=58616448.00, expected≈58616387.00 (error=0.00%)

从测试结果来看,虽然计算正确,但性能只有1.08 GFLOP/s,这明显低于现代CPU的理论算力。

不利于人工智能时代的大规模计算,所以将在接下来进行优化

二、1*4版本

1×4循环展开优化的主要优势:

  1. 减少循环开销:每次迭代计算4个元素,减少循环控制开销
  2. 提高指令级并行:编译器可以更好地利用CPU的流水线和SIMD指令
  3. 改善数据局部性:对矩阵B的访问模式更连续,提高缓存利用率
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>

// 获取当前时间(毫秒)
double getCurrentTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}

// 原始矩阵乘法
void matmul_original(const float* A, const float* B, float* C, int M, int N, int K) {
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
float sum = 0.0f;
for (int k = 0; k < K; k++) {
sum += A[m * K + k] * B[k * N + n];
}
C[m * N + n] = sum;
}
}
}

// 1×4循环展开优化
void matmul_unrolled_1x4(const float* A, const float* B, float* C, int M, int N, int K) {
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n += 4) {
// 初始化累加器
float sum0 = 0.0f;
float sum1 = 0.0f;
float sum2 = 0.0f;
float sum3 = 0.0f;

// 确保不会越界
int n_end = (n + 4 <= N) ? n + 4 : N;

for (int k = 0; k < K; k++) {
float a = A[m * K + k];
sum0 += a * B[k * N + (n + 0)];
if (n + 1 < n_end) sum1 += a * B[k * N + (n + 1)];
if (n + 2 < n_end) sum2 += a * B[k * N + (n + 2)];
if (n + 3 < n_end) sum3 += a * B[k * N + (n + 3)];
}

C[m * N + (n + 0)] = sum0;
if (n + 1 < n_end) C[m * N + (n + 1)] = sum1;
if (n + 2 < n_end) C[m * N + (n + 2)] = sum2;
if (n + 3 < n_end) C[m * N + (n + 3)] = sum3;
}
}
}

// 生成随机矩阵
float* generateRandomMatrix(int rows, int cols) {
float* matrix = (float*)malloc(rows * cols * sizeof(float));
for (int i = 0; i < rows * cols; i++) {
matrix[i] = (float)rand() / RAND_MAX * 2.0f - 1.0f; // -1到1之间的随机数
}
return matrix;
}

// 验证结果
int verifyResults(const float* C1, const float* C2, int M, int N) {
for (int i = 0; i < M * N; i++) {
if (fabs(C1[i] - C2[i]) > 1e-5) {
printf("Verification failed at index %d: %f vs %f\n", i, C1[i], C2[i]);
return 0;
}
}
return 1;
}

int main() {
const int sizes[] = {200, 400, 600, 800};
const int num_sizes = sizeof(sizes) / sizeof(sizes[0]);

double original_times[num_sizes];
double optimized_times[num_sizes];

srand(time(NULL));

for (int i = 0; i < num_sizes; i++) {
int size = sizes[i];
printf("Testing size: %d x %d\n", size, size);

float* A = generateRandomMatrix(size, size);
float* B = generateRandomMatrix(size, size);
float* C1 = (float*)malloc(size * size * sizeof(float));
float* C2 = (float*)malloc(size * size * sizeof(float));

// 测试原始版本
double start = getCurrentTime();
matmul_original(A, B, C1, size, size, size);
original_times[i] = getCurrentTime() - start;

// 测试优化版本
start = getCurrentTime();
matmul_unrolled_1x4(A, B, C2, size, size, size);
optimized_times[i] = getCurrentTime() - start;

// 验证结果
if (!verifyResults(C1, C2, size, size)) {
printf("Result mismatch!\n");
free(A); free(B); free(C1); free(C2);
return 1;
}

// 计算GFLOPS
double flops = 2.0 * size * size * size;
double original_gflops = (flops / (original_times[i] / 1000.0)) / 1e9;
double optimized_gflops = (flops / (optimized_times[i] / 1000.0)) / 1e9;

printf("Original: %.2f ms (%.2f GFLOPS)\n", original_times[i], original_gflops);
printf("Optimized: %.2f ms (%.2f GFLOPS)\n", optimized_times[i], optimized_gflops);
printf("Speedup: %.2fx\n\n", original_times[i] / optimized_times[i]);

free(A); free(B); free(C1); free(C2);
}

// 生成性能对比图数据
printf("\nPerformance comparison data for plotting:\n");
printf("Size\tOriginal GFLOPS\tOptimized GFLOPS\n");
for (int i = 0; i < num_sizes; i++) {
double original_gflops = (2.0 * sizes[i] * sizes[i] * sizes[i] / (original_times[i] / 1000.0)) / 1e9;
double optimized_gflops = (2.0 * sizes[i] * sizes[i] * sizes[i] / (optimized_times[i] / 1000.0)) / 1e9;
printf("%d\t%.2f\t%.2f\n", sizes[i], original_gflops, optimized_gflops);
}

return 0;
}

运行结果是这样的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
Testing size: 200 x 200
Original: 12.25 ms (1.31 GFLOPS)
Optimized: 6.92 ms (2.31 GFLOPS)
Speedup: 1.77x

Testing size: 400 x 400
Original: 99.33 ms (1.29 GFLOPS)
Optimized: 60.26 ms (2.12 GFLOPS)
Speedup: 1.65x

Testing size: 600 x 600
Original: 356.01 ms (1.21 GFLOPS)
Optimized: 215.04 ms (2.01 GFLOPS)
Speedup: 1.66x

Testing size: 800 x 800
Original: 841.57 ms (1.22 GFLOPS)
Optimized: 542.85 ms (1.89 GFLOPS)
Speedup: 1.55x


Performance comparison data for plotting:
Size Original GFLOPS Optimized GFLOPS
200 1.31 2.31
400 1.29 2.12
600 1.21 2.01
800 1.22 1.89

这是两个算法的对比图

image-20250326111759198

可以看到很明显会提高性能

三、1*4+寄存器优化版本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

// 获取当前时间(毫秒)
double getCurrentTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}

// 基础版本 (未优化)
void matmul_basic(float* A, float* B, float* C, int M, int N, int K) {
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
float sum = 0.0f;
for (int k = 0; k < K; k++) {
sum += A[m * K + k] * B[k * N + n];
}
C[m * N + n] = sum;
}
}
}

// 1×4循环展开版本
void matmul_unrolled_1x4(float* A, float* B, float* C, int M, int N, int K) {
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n += 4) {
float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f;
for (int k = 0; k < K; k++) {
float a = A[m * K + k];
sum0 += a * B[k * N + (n + 0)];
sum1 += a * B[k * N + (n + 1)];
sum2 += a * B[k * N + (n + 2)];
sum3 += a * B[k * N + (n + 3)];
}
C[m * N + (n + 0)] = sum0;
C[m * N + (n + 1)] = sum1;
C[m * N + (n + 2)] = sum2;
C[m * N + (n + 3)] = sum3;
}
}
}

// 1×4循环展开+寄存器优化版本
void matmul_unrolled_registers(float* A, float* B, float* C, int M, int N, int K) {
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n += 4) {
register float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f;
for (int k = 0; k < K; k++) {
register float a = A[m * K + k];
sum0 += a * B[k * N + (n + 0)];
sum1 += a * B[k * N + (n + 1)];
sum2 += a * B[k * N + (n + 2)];
sum3 += a * B[k * N + (n + 3)];
}
C[m * N + (n + 0)] = sum0;
C[m * N + (n + 1)] = sum1;
C[m * N + (n + 2)] = sum2;
C[m * N + (n + 3)] = sum3;
}
}
}
// 生成随机矩阵
float* generateRandomMatrix(int rows, int cols) {
float* matrix = (float*)malloc(rows * cols * sizeof(float));
for (int i = 0; i < rows * cols; i++) {
matrix[i] = (float)rand() / RAND_MAX * 2.0f - 1.0f; // -1到1之间的随机数
}
return matrix;
}

int main() {
const int sizes[] = {256, 512, 768, 1024};
const int num_sizes = sizeof(sizes) / sizeof(sizes[0]);

srand(time(NULL));

printf("%-10s %-12s %-12s %-12s %-10s\n",
"Size", "Basic(ms)", "Unrolled(ms)", "Registers(ms)", "Speedup");

for (int i = 0; i < num_sizes; i++) {
int size = sizes[i];
float* A = generateRandomMatrix(size, size);
float* B = generateRandomMatrix(size, size);
float* C1 = (float*)malloc(size * size * sizeof(float));
float* C2 = (float*)malloc(size * size * sizeof(float));
float* C3 = (float*)malloc(size * size * sizeof(float));

// 测试基础版本
double start = getCurrentTime();
matmul_basic(A, B, C1, size, size, size);
double time_basic = getCurrentTime() - start;

// 测试1×4展开版本
start = getCurrentTime();
matmul_unrolled_1x4(A, B, C2, size, size, size);
double time_unrolled = getCurrentTime() - start;

// 测试寄存器优化版本
start = getCurrentTime();
matmul_unrolled_registers(A, B, C3, size, size, size);
double time_registers = getCurrentTime() - start;

// 计算GFLOPS
double flops = 2.0 * size * size * size;
double gflops_basic = (flops / (time_basic / 1000.0)) / 1e9;
double gflops_unrolled = (flops / (time_unrolled / 1000.0)) / 1e9;
double gflops_registers = (flops / (time_registers / 1000.0)) / 1e9;

printf("%-10d %-12.2f %-12.2f %-12.2f %-10.2fx\n",
size, time_basic, time_unrolled, time_registers,
time_basic / time_registers);

free(A); free(B); free(C1); free(C2); free(C3);
}

return 0;
}

运行结果为

1
2
3
4
5
Size       Basic(ms)    Unrolled(ms) Registers(ms) Speedup   
256 26.11 13.77 10.17 2.57x
512 214.92 116.98 81.93 2.62x
768 728.96 408.31 304.76 2.39x
1024 2472.31 1100.44 922.10 2.68x

image-20250326112821711

和原始版本相比明显更好了,时间也加快好多

image-20250326164901881

这张是与1*4做对比,也上升了性能,但在这里我没有看到老师课件展示的极高的性能提升

Screenshot_2025-03-26-16-56-53-68_e39d2c7de19156b

所造成的差异可能是因为硬件

后来稍稍做了些修改

image-20250326182604326

这张图就可以很明显看出差异了

四、4*4版本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>

// 获取当前时间(毫秒)
double getCurrentTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}

// 基础版本
void matmul_basic(const float* A, const float* B, float* C, int M, int N, int K) {
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
float sum = 0.0f;
for (int k = 0; k < K; k++) {
sum += A[m * K + k] * B[k * N + n];
}
C[m * N + n] = sum;
}
}
}

// 4×4分块优化版本
void matmul_4x4_block(const float* A, const float* B, float* C, int M, int N, int K) {
// 清零结果矩阵
for (int i = 0; i < M * N; i++) {
C[i] = 0.0f;
}

// 分块处理
const int BLOCK_SIZE = 4;
for (int m = 0; m < M; m += BLOCK_SIZE) {
for (int n = 0; n < N; n += BLOCK_SIZE) {
for (int k = 0; k < K; k += BLOCK_SIZE) {
// 处理当前4×4×4块
for (int mm = m; mm < m + BLOCK_SIZE && mm < M; mm++) {
for (int nn = n; nn < n + BLOCK_SIZE && nn < N; nn++) {
register float sum = C[mm * N + nn];
for (int kk = k; kk < k + BLOCK_SIZE && kk < K; kk++) {
sum += A[mm * K + kk] * B[kk * N + nn];
}
C[mm * N + nn] = sum;
}
}
}
}
}
}

// 生成随机矩阵
float* generateRandomMatrix(int rows, int cols) {
float* matrix = (float*)malloc(rows * cols * sizeof(float));
for (int i = 0; i < rows * cols; i++) {
matrix[i] = (float)rand() / RAND_MAX * 2.0f - 1.0f; // -1到1之间的随机数
}
return matrix;
}

// 验证结果
int verifyResults(const float* C1, const float* C2, int M, int N) {
for (int i = 0; i < M * N; i++) {
if (fabs(C1[i] - C2[i]) > 1e-5) {
printf("Verification failed at index %d: %f vs %f\n", i, C1[i], C2[i]);
return 0;
}
}
return 1;
}

int main() {
const int sizes[] = {256, 512, 768, 1024};
const int num_sizes = sizeof(sizes) / sizeof(sizes[0]);

srand(time(NULL));

printf("%-10s %-12s %-12s %-10s\n",
"Size", "Basic(ms)", "4x4(ms)", "Speedup");

for (int i = 0; i < num_sizes; i++) {
int size = sizes[i];
float* A = generateRandomMatrix(size, size);
float* B = generateRandomMatrix(size, size);
float* C1 = (float*)malloc(size * size * sizeof(float));
float* C2 = (float*)malloc(size * size * sizeof(float));

// 测试基础版本
double start = getCurrentTime();
matmul_basic(A, B, C1, size, size, size);
double time_basic = getCurrentTime() - start;

// 测试4×4分块版本
start = getCurrentTime();
matmul_4x4_block(A, B, C2, size, size, size);
double time_4x4 = getCurrentTime() - start;

// 验证结果
if (!verifyResults(C1, C2, size, size)) {
printf("Result mismatch!\n");
free(A); free(B); free(C1); free(C2);
return 1;
}

// 计算GFLOPS
double flops = 2.0 * size * size * size;
double gflops_basic = (flops / (time_basic / 1000.0)) / 1e9;
double gflops_4x4 = (flops / (time_4x4 / 1000.0)) / 1e9;

printf("%-10d %-12.2f %-12.2f %-10.2fx\n",
size, time_basic, time_4x4, time_basic / time_4x4);

free(A); free(B); free(C1); free(C2);
}

return 0;
}

这是运行结果

1
2
3
4
5
Size       Basic(ms)    4x4(ms)      Speedup   
256 62.41 68.61 0.91 x
512 555.36 615.05 0.90 x
768 1842.34 1207.17 1.53 x
1024 3907.56 2993.73 1.31 x

这是最终的图

image-20250405143401285

有些意外,4*4版本的性能并没有提高

五、4*4+寄存器优化版本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>

// 函数声明
double getCurrentTime();
void matmul_basic(const float* A, const float* B, float* C, int M, int N, int K);
void matmul_4x4_block(const float* A, const float* B, float* C, int M, int N, int K);
void matmul_4x4_block_register(const float* A, const float* B, float* C, int M, int N, int K);
float* generateRandomMatrix(int rows, int cols);
int verifyResults(const float* C1, const float* C2, int M, int N);

// 获取当前时间(毫秒)
double getCurrentTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}

// 基础版本
void matmul_basic(const float* A, const float* B, float* C, int M, int N, int K) {
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
float sum = 0.0f;
for (int k = 0; k < K; k++) {
sum += A[m * K + k] * B[k * N + n];
}
C[m * N + n] = sum;
}
}
}

// 4×4分块版本
void matmul_4x4_block(const float* A, const float* B, float* C, int M, int N, int K) {
// 清零结果矩阵
for (int i = 0; i < M * N; i++) {
C[i] = 0.0f;
}

// 分块处理
const int BLOCK_SIZE = 4;
for (int m = 0; m < M; m += BLOCK_SIZE) {
for (int n = 0; n < N; n += BLOCK_SIZE) {
for (int k = 0; k < K; k += BLOCK_SIZE) {
// 处理当前4×4×4块
for (int mm = m; mm < m + BLOCK_SIZE && mm < M; mm++) {
for (int nn = n; nn < n + BLOCK_SIZE && nn < N; nn++) {
float sum = C[mm * N + nn];
for (int kk = k; kk < k + BLOCK_SIZE && kk < K; kk++) {
sum += A[mm * K + kk] * B[kk * N + nn];
}
C[mm * N + nn] = sum;
}
}
}
}
}
}

// 4×4分块+寄存器优化版本
void matmul_4x4_block_register(const float* A, const float* B, float* C, int M, int N, int K) {
// 清零结果矩阵
for (int i = 0; i < M * N; i++) {
C[i] = 0.0f;
}

// 分块处理
const int BLOCK_SIZE = 4;
for (int m = 0; m < M; m += BLOCK_SIZE) {
for (int n = 0; n < N; n += BLOCK_SIZE) {
for (int k = 0; k < K; k += BLOCK_SIZE) {
// 寄存器变量声明
register float a00, a01, a02, a03;
register float a10, a11, a12, a13;
register float a20, a21, a22, a23;
register float a30, a31, a32, a33;

register float b00, b01, b02, b03;
register float b10, b11, b12, b13;
register float b20, b21, b22, b23;
register float b30, b31, b32, b33;

register float c00, c01, c02, c03;
register float c10, c11, c12, c13;
register float c20, c21, c22, c23;
register float c30, c31, c32, c33;

// 加载当前块到寄存器
if (m + BLOCK_SIZE <= M && n + BLOCK_SIZE <= N && k + BLOCK_SIZE <= K) {
// 完整块处理
c00 = C[(m+0)*N + (n+0)]; c01 = C[(m+0)*N + (n+1)]; c02 = C[(m+0)*N + (n+2)]; c03 = C[(m+0)*N + (n+3)];
c10 = C[(m+1)*N + (n+0)]; c11 = C[(m+1)*N + (n+1)]; c12 = C[(m+1)*N + (n+2)]; c13 = C[(m+1)*N + (n+3)];
c20 = C[(m+2)*N + (n+0)]; c21 = C[(m+2)*N + (n+1)]; c22 = C[(m+2)*N + (n+2)]; c23 = C[(m+2)*N + (n+3)];
c30 = C[(m+3)*N + (n+0)]; c31 = C[(m+3)*N + (n+1)]; c32 = C[(m+3)*N + (n+2)]; c33 = C[(m+3)*N + (n+3)];

for (int kk = k; kk < k + BLOCK_SIZE; kk++) {
// 加载A的4×1块和B的1×4块
a00 = A[(m+0)*K + kk]; a10 = A[(m+1)*K + kk]; a20 = A[(m+2)*K + kk]; a30 = A[(m+3)*K + kk];
b00 = B[kk*N + (n+0)]; b01 = B[kk*N + (n+1)]; b02 = B[kk*N + (n+2)]; b03 = B[kk*N + (n+3)];

// 计算累加
c00 += a00 * b00; c01 += a00 * b01; c02 += a00 * b02; c03 += a00 * b03;
c10 += a10 * b00; c11 += a10 * b01; c12 += a10 * b02; c13 += a10 * b03;
c20 += a20 * b00; c21 += a20 * b01; c22 += a20 * b02; c23 += a20 * b03;
c30 += a30 * b00; c31 += a30 * b01; c32 += a30 * b02; c33 += a30 * b03;
}

// 存回结果
C[(m+0)*N + (n+0)] = c00; C[(m+0)*N + (n+1)] = c01; C[(m+0)*N + (n+2)] = c02; C[(m+0)*N + (n+3)] = c03;
C[(m+1)*N + (n+0)] = c10; C[(m+1)*N + (n+1)] = c11; C[(m+1)*N + (n+2)] = c12; C[(m+1)*N + (n+3)] = c13;
C[(m+2)*N + (n+0)] = c20; C[(m+2)*N + (n+1)] = c21; C[(m+2)*N + (n+2)] = c22; C[(m+2)*N + (n+3)] = c23;
C[(m+3)*N + (n+0)] = c30; C[(m+3)*N + (n+1)] = c31; C[(m+3)*N + (n+2)] = c32; C[(m+3)*N + (n+3)] = c33;
} else {
// 边界处理(不完整块)
for (int mm = m; mm < m + BLOCK_SIZE && mm < M; mm++) {
for (int nn = n; nn < n + BLOCK_SIZE && nn < N; nn++) {
register float sum = C[mm*N + nn];
for (int kk = k; kk < k + BLOCK_SIZE && kk < K; kk++) {
sum += A[mm*K + kk] * B[kk*N + nn];
}
C[mm*N + nn] = sum;
}
}
}
}
}
}
}

// 生成随机矩阵
float* generateRandomMatrix(int rows, int cols) {
float* matrix = (float*)malloc(rows * cols * sizeof(float));
for (int i = 0; i < rows * cols; i++) {
matrix[i] = (float)rand() / RAND_MAX * 2.0f - 1.0f; // -1到1之间的随机数
}
return matrix;
}

// 验证结果
int verifyResults(const float* C1, const float* C2, int M, int N) {
for (int i = 0; i < M * N; i++) {
if (fabs(C1[i] - C2[i]) > 1e-5) {
printf("Verification failed at index %d: %f vs %f\n", i, C1[i], C2[i]);
return 0;
}
}
return 1;
}

int main() {
const int sizes[] = {256, 512, 768, 1024};
const int num_sizes = sizeof(sizes) / sizeof(sizes[0]);

srand(time(NULL));

printf("%-10s %-12s %-12s %-12s %-10s\n",
"Size", "Basic(ms)", "4x4(ms)", "4x4+Reg(ms)", "Speedup");

for (int i = 0; i < num_sizes; i++) {
int size = sizes[i];
float* A = generateRandomMatrix(size, size);
float* B = generateRandomMatrix(size, size);
float* C1 = (float*)malloc(size * size * sizeof(float));
float* C2 = (float*)malloc(size * size * sizeof(float));
float* C3 = (float*)malloc(size * size * sizeof(float));

// 测试基础版本
double start = getCurrentTime();
matmul_basic(A, B, C1, size, size, size);
double time_basic = getCurrentTime() - start;

// 测试4×4分块版本
start = getCurrentTime();
matmul_4x4_block(A, B, C2, size, size, size);
double time_4x4 = getCurrentTime() - start;

// 测试4×4分块+寄存器版本
start = getCurrentTime();
matmul_4x4_block_register(A, B, C3, size, size, size);
double time_4x4_reg = getCurrentTime() - start;

// 验证结果
if (!verifyResults(C1, C2, size, size) || !verifyResults(C1, C3, size, size)) {
printf("Result mismatch!\n");
free(A); free(B); free(C1); free(C2); free(C3);
return 1;
}

// 计算GFLOPS
double flops = 2.0 * size * size * size;
double gflops_basic = (flops / (time_basic / 1000.0)) / 1e9;
double gflops_4x4 = (flops / (time_4x4 / 1000.0)) / 1e9;
double gflops_4x4_reg = (flops / (time_4x4_reg / 1000.0)) / 1e9;

printf("%-10d %-12.2f %-12.2f %-12.2f %-10.2fx\n",
size, time_basic, time_4x4, time_4x4_reg,
time_basic / time_4x4_reg);

free(A); free(B); free(C1); free(C2); free(C3);
}

return 0;
}

这是结果

1
2
3
4
5
Size       Basic(ms)    4x4(ms)      4x4+Reg(ms)  Speedup   
256 43.04 40.14 13.05 3.30 x
512 339.10 381.91 120.44 2.82 x
768 1109.89 1169.28 420.74 2.64 x
1024 3858.35 3125.46 1118.45 3.45 x

image-20250405153956703

可以看到加了寄存器的性能要高上好多。

六、Cache 优化版本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <stdint.h> // 用于size_t

#define BLOCK_SIZE 32

// 函数声明
double getCurrentTime();
void matmul_basic(const float* A, const float* B, float* C, int M, int N, int K);
void matmul_4x4_block_register(const float* A, const float* B, float* C, int M, int N, int K);
void matmul_cache_optimized(const float* A, const float* B, float* C, int M, int N, int K);
float* generateRandomMatrix(int rows, int cols);
int verifyResults(const float* C1, const float* C2, int M, int N);

// 获取当前时间(毫秒)
double getCurrentTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}

// 基础矩阵乘法
void matmul_basic(const float* A, const float* B, float* C, int M, int N, int K) {
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
float sum = 0.0f;
for (int k = 0; k < K; k++) {
sum += A[m * K + k] * B[k * N + n];
}
C[m * N + n] = sum;
}
}
}

// 4×4分块+寄存器优化版本(完整实现)
void matmul_4x4_block_register(const float* A, const float* B, float* C, int M, int N, int K) {
for (int i = 0; i < M * N; i++) C[i] = 0.0f;

const int block = 4;
for (int m = 0; m < M; m += block) {
for (int n = 0; n < N; n += block) {
for (int k = 0; k < K; k += block) {
for (int mm = m; mm < m + block && mm < M; mm++) {
for (int nn = n; nn < n + block && nn < N; nn++) {
register float sum = C[mm * N + nn];
for (int kk = k; kk < k + block && kk < K; kk++) {
sum += A[mm * K + kk] * B[kk * N + nn];
}
C[mm * N + nn] = sum;
}
}
}
}
}
}

// Cache优化版本(完整实现)
void matmul_cache_optimized(const float* A, const float* B, float* C, int M, int N, int K) {
for (int i = 0; i < M * N; i++) C[i] = 0.0f;

for (int m = 0; m < M; m += BLOCK_SIZE) {
for (int k = 0; k < K; k += BLOCK_SIZE) {
for (int n = 0; n < N; n += BLOCK_SIZE) {
for (int mm = m; mm < m + BLOCK_SIZE && mm < M; mm++) {
for (int kk = k; kk < k + BLOCK_SIZE && kk < K; kk++) {
register float a = A[mm * K + kk];
for (int nn = n; nn < n + BLOCK_SIZE && nn < N; nn++) {
C[mm * N + nn] += a * B[kk * N + nn];
}
}
}
}
}
}
}

// 生成随机矩阵
float* generateRandomMatrix(int rows, int cols) {
size_t size = (size_t)rows * cols;
float* matrix = (float*)malloc(size * sizeof(float));
if (matrix == NULL) {
fprintf(stderr, "内存分配失败\n");
exit(EXIT_FAILURE);
}

for (size_t i = 0; i < size; i++) {
matrix[i] = (float)rand() / RAND_MAX * 2.0f - 1.0f;
}
return matrix;
}

// 验证结果
int verifyResults(const float* C1, const float* C2, int M, int N) {
for (int i = 0; i < M * N; i++) {
if (fabs(C1[i] - C2[i]) > 1e-5) {
printf("验证失败 at %d: %f vs %f\n", i, C1[i], C2[i]);
return 0;
}
}
return 1;
}

int main() {
const int sizes[] = {256, 512, 768, 1024};
const int num_sizes = sizeof(sizes) / sizeof(sizes[0]);

srand((unsigned int)time(NULL)); // 显式类型转换

printf("%-10s %-12s %-12s %-12s %-10s\n",
"Size", "Basic(ms)", "4x4+Reg(ms)", "Cache(ms)", "Speedup");

for (int i = 0; i < num_sizes; i++) {
int size = sizes[i];
float* A = generateRandomMatrix(size, size);
float* B = generateRandomMatrix(size, size);

size_t matrix_size = (size_t)size * size;
float* C1 = (float*)malloc(matrix_size * sizeof(float));
float* C2 = (float*)malloc(matrix_size * sizeof(float));
float* C3 = (float*)malloc(matrix_size * sizeof(float));

if (!A || !B || !C1 || !C2 || !C3) {
fprintf(stderr, "内存分配失败\n");
exit(EXIT_FAILURE);
}

// 测试基础版本
double start = getCurrentTime();
matmul_basic(A, B, C1, size, size, size);
double time_basic = getCurrentTime() - start;

// 测试4×4分块+寄存器版本
start = getCurrentTime();
matmul_4x4_block_register(A, B, C2, size, size, size);
double time_4x4_reg = getCurrentTime() - start;

// 测试Cache优化版本
start = getCurrentTime();
matmul_cache_optimized(A, B, C3, size, size, size);
double time_cache = getCurrentTime() - start;

// 验证结果
if (!verifyResults(C1, C2, size, size) || !verifyResults(C1, C3, size, size)) {
printf("结果不匹配!\n");
free(A); free(B); free(C1); free(C2); free(C3);
return 1;
}

// 计算加速比
double speedup = time_basic / time_cache;

printf("%-10d %-12.2f %-12.2f %-12.2f %-10.2fx\n",
size, time_basic, time_4x4_reg, time_cache, speedup);

free(A); free(B); free(C1); free(C2); free(C3);
}

return 0;
}

这是结果

1
2
3
4
5
Size       Basic(ms)    4x4+Reg(ms)  Cache(ms)    Speedup   
256 38.00 37.00 37.78 1.01 x
512 306.46 299.64 334.56 0.92 x
768 1078.95 1128.33 1174.43 0.92 x
1024 3761.91 2955.08 2765.15 1.36 x

最终的性能对比图是这样的

image-20250406004017527

相比于原始版本有提升部分性能。

实验总结

在这场实验中,我见识到了不止是代码方面,还有思维、硬件方面可以进行加速优化,让我收获良多


矩阵乘加速实验
http://example.com/2025/03/26/矩阵乘加速实验/
作者
yuhua
发布于
2025年3月26日
许可协议