/*
 * t4.cpp - dense square matrix multiplication benchmark with an optional
 * NEON-accelerated kernel.
 *
 * Provenance: added by patch 32394ed4 ("ADD file via upload").
 *
 * The code is written in the C-compatible subset of C++ (explicit casts on
 * allocation results) so it builds with either a C or a C++ compiler.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Only pull in the NEON intrinsics when the target actually provides them,
 * so the file still builds (with the scalar path) on other architectures. */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define MATMUL_HAVE_NEON 1
#else
#define MATMUL_HAVE_NEON 0
#endif

#define SIZE 1024

/**
 * Accumulate the product of two n x n matrices into C:  C += A * B.
 *
 * All three matrices are arrays of n row pointers, each row holding n floats.
 * C is read as well as written, so the caller must initialize it; pass a
 * zero-filled C to obtain the plain product. C must not share storage with
 * A or B. Any n is accepted (it need not be a multiple of 4); n <= 0 or a
 * NULL matrix pointer makes the call a no-op.
 *
 * The loops run in i-k-j order: A[i][k] is broadcast and multiplied with four
 * consecutive elements of row B[k], which are contiguous in memory and line
 * up with four consecutive elements of row C[i]. This keeps every vector
 * load/store inside a single row.
 *
 * @param A  left operand, n rows of n floats (read only)
 * @param B  right operand, n rows of n floats (read only)
 * @param C  accumulator / result, n rows of n floats (read and written)
 * @param n  matrix dimension
 */
void matmul_optimized(float** A, float** B, float** C, int n) {
    if (A == NULL || B == NULL || C == NULL || n <= 0) {
        return;
    }

    for (int i = 0; i < n; i++) {
        const float *a_row = A[i];
        float *c_row = C[i];

        for (int k = 0; k < n; k++) {
            const float a_ik = a_row[k];
            const float *b_row = B[k];
            int j = 0;

#if MATMUL_HAVE_NEON
            /* Four columns at a time: c[j..j+3] += b[j..j+3] * a_ik. */
            for (; j <= n - 4; j += 4) {
                float32x4_t c_vec = vld1q_f32(c_row + j);
                float32x4_t b_vec = vld1q_f32(b_row + j);
                c_vec = vmlaq_n_f32(c_vec, b_vec, a_ik);
                vst1q_f32(c_row + j, c_vec);
            }
#endif
            /* Scalar tail (and the whole row when NEON is unavailable). */
            for (; j < n; j++) {
                c_row[j] += a_ik * b_row[j];
            }
        }
    }
}

/* Release the first `rows` rows of a matrix and then its row-pointer table. */
static void free_matrix(float **m, int rows) {
    if (m == NULL) {
        return;
    }
    for (int i = 0; i < rows; i++) {
        free(m[i]);
    }
    free(m);
}

/* Allocate a zero-filled n x n matrix; returns NULL if any allocation fails. */
static float **alloc_matrix(int n) {
    float **m = (float **)calloc((size_t)n, sizeof *m);
    if (m == NULL) {
        return NULL;
    }
    for (int i = 0; i < n; i++) {
        m[i] = (float *)calloc((size_t)n, sizeof **m);
        if (m[i] == NULL) {
            free_matrix(m, i); /* only rows [0, i) were allocated */
            return NULL;
        }
    }
    return m;
}

int main(void) {
    /* Allocate the matrices; C starts zeroed so that C += A*B yields A*B. */
    float **A = alloc_matrix(SIZE);
    float **B = alloc_matrix(SIZE);
    float **C = alloc_matrix(SIZE);

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "matrix allocation failed\n");
        free_matrix(A, SIZE);
        free_matrix(B, SIZE);
        free_matrix(C, SIZE);
        return EXIT_FAILURE;
    }

    /* Fill A and B with pseudo-random integers in [0, 99]. */
    srand((unsigned int)time(NULL));
    for (int i = 0; i < SIZE; i++) {
        for (int j = 0; j < SIZE; j++) {
            A[i][j] = (float)(rand() % 100);
            B[i][j] = (float)(rand() % 100);
        }
    }

    /* Time the multiplication (clock() reports processor time). */
    clock_t start = clock();
    matmul_optimized(A, B, C, SIZE);
    clock_t end = clock();

    /* Report the elapsed time in milliseconds. */
    printf("NEON优化矩阵乘法耗时: %lf 毫秒\n", 1000.0 * (end - start) / CLOCKS_PER_SEC);

    /* Release all matrices. */
    free_matrix(A, SIZE);
    free_matrix(B, SIZE);
    free_matrix(C, SIZE);

    return 0;
}