/*
 * Benchmark comparing a scalar and a NEON single-precision matrix
 * multiplication on SIZE x SIZE matrices.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Only pull in the NEON intrinsics when the target actually provides them,
 * so the file still builds on other targets (the vector path is then
 * compiled out and matmul_optimized() runs its scalar loop only). */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define MATMUL_HAVE_NEON 1
#else
#define MATMUL_HAVE_NEON 0
#endif

#define SIZE 1024

/**
 * Baseline matrix multiplication: C = A * B.
 *
 * Only the top-left n x n sub-matrices are read and written; rows keep a
 * stride of SIZE elements.
 *
 * @param A left operand
 * @param B right operand
 * @param C result; every element of the n x n region is overwritten
 * @param n number of rows/columns to process; callers must keep n <= SIZE
 */
void matmul(float A[SIZE][SIZE], float B[SIZE][SIZE], float C[SIZE][SIZE], int n)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            float sum = 0.0f;
            for (int k = 0; k < n; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}

/**
 * NEON matrix multiplication: C = A * B, four output columns at a time.
 *
 * For each group of four columns j..j+3 the accumulator holds
 * C[i][j..j+3]; every step k adds A[i][k] * B[k][j..j+3]. A[i][k] is a
 * single scalar that must be applied to all four lanes, so it is passed to
 * vmlaq_n_f32() rather than being loaded as a 4-element vector.
 *
 * Columns that do not fill a complete group of four (n not a multiple of 4)
 * are handled by a scalar loop, which is also the only loop compiled when
 * NEON is unavailable.
 *
 * @param A left operand
 * @param B right operand
 * @param C result; every element of the n x n region is overwritten
 * @param n number of rows/columns to process; callers must keep n <= SIZE
 */
void matmul_optimized(float A[SIZE][SIZE], float B[SIZE][SIZE], float C[SIZE][SIZE], int n)
{
    for (int i = 0; i < n; i++) {
        int j = 0;

#if MATMUL_HAVE_NEON
        for (; j + 4 <= n; j += 4) {
            float32x4_t sum = vdupq_n_f32(0.0f);
            for (int k = 0; k < n; k++) {
                float32x4_t b = vld1q_f32(&B[k][j]);   /* B[k][j..j+3] */
                sum = vmlaq_n_f32(sum, b, A[i][k]);    /* sum += b * A[i][k] */
            }
            vst1q_f32(&C[i][j], sum);
        }
#endif

        /* Remaining columns (or all of them without NEON). */
        for (; j < n; j++) {
            float sum = 0.0f;
            for (int k = 0; k < n; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}

/* Set the top-left n x n region of M to zero. */
static void clear_matrix(float M[SIZE][SIZE], int n)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            M[i][j] = 0.0f;
        }
    }
}

/**
 * Allocates three SIZE x SIZE matrices, fills A and B with pseudo-random
 * integers in [0, 99], then times both multiplication routines with clock()
 * and prints the elapsed time of each.
 *
 * @return 0 on success, 1 if a matrix could not be allocated
 */
int main(void)
{
    /* Allocate each matrix as SIZE rows of float[SIZE]. */
    float (*A)[SIZE] = malloc(SIZE * sizeof *A);
    float (*B)[SIZE] = malloc(SIZE * sizeof *B);
    float (*C)[SIZE] = malloc(SIZE * sizeof *C);

    if (A == NULL || B == NULL || C == NULL) {
        printf("Memory allocation failed!\n");
        /* free(NULL) is a no-op, so release whichever allocations succeeded. */
        free(A);
        free(B);
        free(C);
        return 1;
    }

    /* Seed the generator and initialize the input matrices. */
    srand((unsigned)time(NULL));
    for (int i = 0; i < SIZE; i++) {
        for (int j = 0; j < SIZE; j++) {
            A[i][j] = (float)(rand() % 100);
            B[i][j] = (float)(rand() % 100);
        }
    }

    /* Time the baseline multiplication. */
    clear_matrix(C, SIZE);
    clock_t start = clock();
    matmul(A, B, C, SIZE);
    clock_t end = clock();
    double time_spent = (double)(end - start) / CLOCKS_PER_SEC;
    printf("基础矩阵乘法的运行时间为: %lf 秒\n", time_spent);

    /* Time the NEON multiplication. */
    clear_matrix(C, SIZE);
    start = clock();
    matmul_optimized(A, B, C, SIZE);
    end = clock();
    time_spent = (double)(end - start) / CLOCKS_PER_SEC;
    printf("NEON优化矩阵乘法的运行时间为: %lf 秒\n", time_spent);

    free(A);
    free(B);
    free(C);

    return 0;
}