From 86d7ae55fd60bfc226af1fa2de230acfb8449904 Mon Sep 17 00:00:00 2001 From: pv3e4i5aj Date: Sat, 23 Nov 2024 18:09:20 +0800 Subject: [PATCH] Add task4.cpp --- task4.cpp | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 task4.cpp diff --git a/task4.cpp b/task4.cpp new file mode 100644 index 0000000..a3f5ac8 --- /dev/null +++ b/task4.cpp @@ -0,0 +1,92 @@ +#include +#include +#include //启用 NEON 指令 +#include + +#define SIZE 1024 + +void transposeMatrix(float** matrix, float** transposed, int size) { + for (int i = 0; i < size; i++) { + for (int j = 0; j < size; j++) { + transposed[j][i] = matrix[i][j]; + } + } +} + +void matmul(float** A, float** B, float** C, int n) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + float sum = 0.0f; + for (int k = 0; k < n; k += 4) + { + float32x4_t vecA, vecB, vecC; + if (k + 4 <= n) + { + // 加载A和B的4个元素,进行向量化计算 + vecA = vld1q_f32(&A[i][k]); + vecB = vld1q_f32(&B[j][k]); + // 向量化乘法并累加结果 + vecC = vmulq_f32(vecA, vecB); + sum += vgetq_lane_f32(vecC, 0) + vgetq_lane_f32(vecC, 1) + + vgetq_lane_f32(vecC, 2) + vgetq_lane_f32(vecC, 3); + } + else + { + // 处理剩余的元素 + for (int m = k; m < n; m++) + { + sum += A[i][m] * B[j][m]; + } + } + } + C[i][j] = sum; + } + } +} + +int main() { + srand(time(NULL)); + + float** A = (float**)malloc(SIZE * sizeof(float*)); + float** B = (float**)malloc(SIZE * sizeof(float*)); + float** C = (float**)malloc(SIZE * sizeof(float*)); + float** transposed = (float**)malloc(SIZE * sizeof(float*)); + + for (int i = 0; i < SIZE; i++) { + A[i] = (float*)malloc(SIZE * sizeof(float)); + B[i] = (float*)malloc(SIZE * sizeof(float)); + C[i] = (float*)malloc(SIZE * sizeof(float)); + transposed[i] = (float*)malloc(SIZE * sizeof(float)); + } + + for (int i = 0; i < SIZE; i++) { + for (int j = 0; j < SIZE; j++) { + A[i][j] = (float)(rand() % 100) / 100.0f; + B[i][j] = (float)(rand() % 100) / 100.0f; + } + } + + transposeMatrix(B, transposed, SIZE); + + clock_t start = clock(); + matmul(A, transposed, C, SIZE); + clock_t end = clock(); + + // 计算并输出矩阵乘法的时间 + double multiply_time_spent = double(end - start) / CLOCKS_PER_SEC; + printf("使用优化的向量乘法:\n当SIZE取%d时,优化的向量乘法时间:%lf秒\n", SIZE, multiply_time_spent); + + + for (int i = 0; i < SIZE; i++) { + free(A[i]); + free(B[i]); + free(C[i]); + free(transposed[i]); + } + free(A); + free(B); + free(C); + free(transposed); + + return 0; +} \ No newline at end of file