/*
 * Untitled1.cpp - micro-benchmark comparing plain scalar loops with ARM NEON
 * kernels for float vector addition and square matrix multiplication.
 *
 * Provenance: added by patch a8c42927e8ca56e61ea1fb7eef1c3f04af93e22f
 * ("ADD file via upload", Fri, 29 Nov 2024 16:06:17 +0800).
 *
 * The code is written to build both as C11 and as C++17.  On targets without
 * NEON the "neon" entry points fall back to scalar code so the program still
 * builds and runs (the timings are then simply scalar vs. scalar).
 */

/* Expose clock_gettime()/CLOCK_MONOTONIC when compiling with a strict -std= mode. */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define BENCH_HAVE_NEON 1
#else
#define BENCH_HAVE_NEON 0
#endif

/*
 * Scalar vector addition: c[i] = a[i] + b[i] for 0 <= i < n.
 * Does nothing when n <= 0.
 */
void vector_add_base(const float *a, const float *b, float *c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

/*
 * NEON vector addition: same contract as vector_add_base().
 * Processes four floats per iteration, then finishes the (n % 4) tail with
 * scalar code.  Without NEON the whole range is handled by the scalar tail.
 */
void vector_add_neon(const float *a, const float *b, float *c, int n) {
    int i = 0;
#if BENCH_HAVE_NEON
    /* "n - i >= 4" cannot overflow, unlike "i <= n - 4" for extreme n. */
    for (; n - i >= 4; i += 4) {
        float32x4_t va = vld1q_f32(&a[i]);
        float32x4_t vb = vld1q_f32(&b[i]);
        vst1q_f32(&c[i], vaddq_f32(va, vb));
    }
#endif
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

/*
 * Scalar matrix multiplication C = A * B for row-major N x N matrices.
 * Does nothing when N <= 0.
 */
void matrix_multiply_base(const float *A, const float *B, float *C, int N) {
    if (N <= 0) {
        return;
    }
    const size_t n = (size_t)N; /* index in size_t so i * N cannot overflow int */
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
            float sum = 0.0f;
            for (size_t k = 0; k < n; k++) {
                sum += A[i * n + k] * B[k * n + j];
            }
            C[i * n + j] = sum;
        }
    }
}

/*
 * NEON matrix multiplication: same contract as matrix_multiply_base().
 *
 * Four adjacent output columns C[i][j..j+3] are computed at once:
 *     acc += A[i][k] * B[k][j..j+3]   for every k
 * so each load from B is a contiguous run inside row k.  (Loading four
 * contiguous floats of B and multiplying them lane-wise with A[i][k..k+3]
 * would mix up rows and columns of B and yield a wrong product.)
 * Columns left over when N is not a multiple of 4 use the scalar loop.
 */
void matrix_multiply_neon(const float *A, const float *B, float *C, int N) {
    if (N <= 0) {
        return;
    }
    const size_t n = (size_t)N;
    for (size_t i = 0; i < n; i++) {
        const float *a_row = &A[i * n];
        float *c_row = &C[i * n];
        size_t j = 0;
#if BENCH_HAVE_NEON
        for (; n - j >= 4; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);
            for (size_t k = 0; k < n; k++) {
                /* acc += B[k][j..j+3] * A[i][k] */
                acc = vmlaq_n_f32(acc, vld1q_f32(&B[k * n + j]), a_row[k]);
            }
            vst1q_f32(&c_row[j], acc);
        }
#endif
        /* Remaining columns (all of them when NEON is unavailable). */
        for (; j < n; j++) {
            float sum = 0.0f;
            for (size_t k = 0; k < n; k++) {
                sum += a_row[k] * B[k * n + j];
            }
            c_row[j] = sum;
        }
    }
}

/*
 * Fill an N x N matrix with pseudo-random values in [0.00, 0.99] taken from
 * rand().  Does nothing when N <= 0.
 */
void initialize_matrix(float *matrix, int N) {
    if (N <= 0) {
        return;
    }
    const size_t count = (size_t)N * (size_t)N;
    for (size_t i = 0; i < count; i++) {
        matrix[i] = (float)(rand() % 100) / 100.0f;
    }
}

/*
 * Return a timestamp in seconds as a double.
 * Uses the monotonic clock when the platform exposes it; otherwise falls back
 * to the C11 wall clock.  Only differences between two calls are meaningful.
 * If the clock cannot be read the result is 0.0.
 */
double get_time(void) {
    struct timespec ts = {0, 0}; /* defined value even if the clock call fails */
#if defined(CLOCK_MONOTONIC)
    (void)clock_gettime(CLOCK_MONOTONIC, &ts);
#else
    (void)timespec_get(&ts, TIME_UTC);
#endif
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}

/*
 * Benchmark driver: times the scalar and NEON versions of vector addition
 * (length N) and matrix multiplication (N x N) and prints the elapsed time of
 * each.  Returns EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(void) {
    const int N = 1024; /* vector length and matrix dimension */
    /* Both sizes are multiples of the 16-byte alignment, as aligned_alloc expects. */
    const size_t vec_bytes = (size_t)N * sizeof(float);
    const size_t mat_bytes = (size_t)N * (size_t)N * sizeof(float);

    /* The casts are not needed in C but keep the file valid as C++. */
    float *a = (float *)aligned_alloc(16, vec_bytes);
    float *b = (float *)aligned_alloc(16, vec_bytes);
    float *c = (float *)aligned_alloc(16, vec_bytes);
    float *A = (float *)aligned_alloc(16, mat_bytes);
    float *B = (float *)aligned_alloc(16, mat_bytes);
    float *C = (float *)aligned_alloc(16, mat_bytes);

    if (!a || !b || !c || !A || !B || !C) {
        fprintf(stderr, "aligned_alloc failed\n");
        /* free(NULL) is a no-op, so releasing every pointer is safe. */
        free(a);
        free(b);
        free(c);
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    /* Initialise the input vectors and matrices. */
    for (int i = 0; i < N; i++) {
        a[i] = (float)(rand() % 100) / 100.0f;
        b[i] = (float)(rand() % 100) / 100.0f;
    }
    initialize_matrix(A, N);
    initialize_matrix(B, N);

    /* Scalar vector addition. */
    double start_time = get_time();
    vector_add_base(a, b, c, N);
    double end_time = get_time();
    printf("基础向量加法时间: %.6f 秒\n", end_time - start_time);

    /* NEON vector addition. */
    start_time = get_time();
    vector_add_neon(a, b, c, N);
    end_time = get_time();
    printf("NEON优化向量加法时间: %.6f 秒\n", end_time - start_time);

    /* Scalar matrix multiplication. */
    start_time = get_time();
    matrix_multiply_base(A, B, C, N);
    end_time = get_time();
    printf("基础矩阵乘法时间: %.6f 秒\n", end_time - start_time);

    /* NEON matrix multiplication. */
    start_time = get_time();
    matrix_multiply_neon(A, B, C, N);
    end_time = get_time();
    printf("NEON优化矩阵乘法时间: %.6f 秒\n", end_time - start_time);

    /* Release all buffers. */
    free(a);
    free(b);
    free(c);
    free(A);
    free(B);
    free(C);

    return 0;
}