You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
pdc/算子优化系统2.cpp

55 lines
1.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <arm_neon.h>
// Number of float elements in each test vector (used to size the arrays in main).
#define SIZE 1024
// Scalar reference implementation of element-wise vector addition.
// Writes A[k] + B[k] into C[k] for every k in [0, size), in ascending order.
// Does nothing when size is zero or negative.
void vector_add(float* A, float* B, float* C, int size) {
    int k = 0;
    while (k < size) {
        const float sum = A[k] + B[k];
        C[k] = sum;
        ++k;
    }
}
// NEON-accelerated element-wise vector addition.
// Writes A[k] + B[k] into C[k] for every k in [0, size). Elements are handled
// four at a time with 128-bit NEON float vectors; any remaining elements
// (fewer than four) are added one by one in a scalar tail loop.
void vector_add_optimized(float* A, float* B, float* C, int size) {
    // A full group of four starting at idx fits only while idx < size - 3.
    const int simd_limit = size - 3;
    int idx = 0;

    while (idx < simd_limit) {
        // Load four consecutive floats from each input, add lane-wise, and
        // store the four sums to the output.
        const float32x4_t lhs = vld1q_f32(A + idx);
        const float32x4_t rhs = vld1q_f32(B + idx);
        vst1q_f32(C + idx, vaddq_f32(lhs, rhs));
        idx += 4;
    }

    // Scalar tail for the leftover elements.
    while (idx < size) {
        C[idx] = A[idx] + B[idx];
        ++idx;
    }
}
// Benchmark driver: fills two SIZE-element vectors with pseudo-random values,
// times one call of the scalar and one call of the NEON vector addition with
// clock(), checks that both produced the same output, and prints the elapsed
// times in seconds.
// Returns 0 on success, or EXIT_FAILURE if the two results disagree.
int main() {
    float A[SIZE];
    float B[SIZE];
    float C[SIZE];
    float C_optimized[SIZE];

    // Fill both inputs with whole numbers in [0, 99]. rand() is not seeded in
    // this function.
    for (int i = 0; i < SIZE; i++) {
        A[i] = (float)(rand() % 100);
        B[i] = (float)(rand() % 100);
    }

    // Time the scalar baseline.
    // NOTE(review): a single SIZE-element call may take less than one clock()
    // tick, so the reported time can be 0 — confirm whether repeated runs are
    // wanted.
    clock_t start_time_original = clock();
    vector_add(A, B, C, SIZE);
    clock_t end_time_original = clock();
    double elapsed_time_original = (double)(end_time_original - start_time_original) / CLOCKS_PER_SEC;

    // Time the NEON version on the same inputs.
    clock_t start_time_optimized = clock();
    vector_add_optimized(A, B, C_optimized, SIZE);
    clock_t end_time_optimized = clock();
    double elapsed_time_optimized = (double)(end_time_optimized - start_time_optimized) / CLOCKS_PER_SEC;

    // Check the NEON output against the scalar baseline before reporting any
    // timing. Both paths add the same pair of floats per element, so the
    // results are expected to match exactly. Reading the outputs here also
    // means they are no longer write-only.
    for (int i = 0; i < SIZE; i++) {
        if (C[i] != C_optimized[i]) {
            fprintf(stderr, "result mismatch at index %d: scalar %f, NEON %f\n",
                    i, C[i], C_optimized[i]);
            return EXIT_FAILURE;
        }
    }

    // Report the scalar time, then the NEON time.
    printf("original time: %lf s\n", elapsed_time_original);
    printf("NEON optimized time: %lf s\n", elapsed_time_optimized);
    return 0;
}