#include #include #include //启用 NEON 指令 #include #define SIZE 1024 void transposeMatrix(float** matrix, float** transposed, int size) { for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { transposed[j][i] = matrix[i][j]; } } } void matmul(float** A, float** B, float** C, int n) { for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { float sum = 0.0f; for (int k = 0; k < n; k += 4) { float32x4_t vecA, vecB, vecC; if (k + 4 <= n) { // 加载A和B的4个元素,进行向量化计算 vecA = vld1q_f32(&A[i][k]); vecB = vld1q_f32(&B[j][k]); // 向量化乘法并累加结果 vecC = vmulq_f32(vecA, vecB); sum += vgetq_lane_f32(vecC, 0) + vgetq_lane_f32(vecC, 1) + vgetq_lane_f32(vecC, 2) + vgetq_lane_f32(vecC, 3); } else { // 处理剩余的元素 for (int m = k; m < n; m++) { sum += A[i][m] * B[j][m]; } } } C[i][j] = sum; } } } int main() { srand(time(NULL)); float** A = (float**)malloc(SIZE * sizeof(float*)); float** B = (float**)malloc(SIZE * sizeof(float*)); float** C = (float**)malloc(SIZE * sizeof(float*)); float** transposed = (float**)malloc(SIZE * sizeof(float*)); for (int i = 0; i < SIZE; i++) { A[i] = (float*)malloc(SIZE * sizeof(float)); B[i] = (float*)malloc(SIZE * sizeof(float)); C[i] = (float*)malloc(SIZE * sizeof(float)); transposed[i] = (float*)malloc(SIZE * sizeof(float)); } for (int i = 0; i < SIZE; i++) { for (int j = 0; j < SIZE; j++) { A[i][j] = (float)(rand() % 100) / 100.0f; B[i][j] = (float)(rand() % 100) / 100.0f; } } transposeMatrix(B, transposed, SIZE); clock_t start = clock(); matmul(A, transposed, C, SIZE); clock_t end = clock(); // 计算并输出矩阵乘法的时间 double multiply_time_spent = double(end - start) / CLOCKS_PER_SEC; printf("使用优化的向量乘法:\n当SIZE取%d时,优化的向量乘法时间:%lf秒\n", SIZE, multiply_time_spent); for (int i = 0; i < SIZE; i++) { free(A[i]); free(B[i]); free(C[i]); free(transposed[i]); } free(A); free(B); free(C); free(transposed); return 0; }