|
|
@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
#include <arm_neon.h> //启用 NEON 指令
|
|
|
|
|
|
|
|
#include <ctime>
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define SIZE 1024
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Element-wise single-precision vector addition: C[i] = A[i] + B[i]
 * for every i in [0, size).
 *
 * A, B  input arrays, each holding at least `size` floats
 * C     output array with room for at least `size` floats
 * size  number of elements to process; values <= 0 make the call a no-op
 *
 * When the compiler advertises NEON support, elements are processed four at
 * a time with 128-bit loads/stores; whatever is left over (size % 4 elements)
 * is handled by the scalar loop, so `size` does not have to be a multiple
 * of 4 and no access ever goes past index size - 1.
 */
void vector_add_optimized(float* A, float* B, float* C, int size)
{
    int i = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Largest multiple of 4 that does not exceed size (0 for size <= 0). */
    const int vec_end = (size > 0) ? size - (size % 4) : 0;

    for (; i < vec_end; i += 4)
    {
        /* Load four floats from A and from B into NEON registers. */
        float32x4_t vecA = vld1q_f32(&A[i]);
        float32x4_t vecB = vld1q_f32(&B[i]);

        /* Lane-wise addition of the two vectors. */
        float32x4_t vecC = vaddq_f32(vecA, vecB);

        /* Write the four sums back to C. */
        vst1q_f32(&C[i], vecC);
    }
#endif

    /*
     * Scalar tail: finishes the elements the vector loop could not cover.
     * If the NEON path above is compiled out, i is still 0 here and this
     * loop processes the whole array.
     */
    for (; i < size; ++i)
    {
        C[i] = A[i] + B[i];
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int main()
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
srand(time(NULL));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 动态分配内存,并检查是否分配成功
|
|
|
|
|
|
|
|
float *A = (float *)malloc(SIZE * sizeof(float));
|
|
|
|
|
|
|
|
float *B = (float *)malloc(SIZE * sizeof(float));
|
|
|
|
|
|
|
|
float *C = (float *)malloc(SIZE * sizeof(float));
|
|
|
|
|
|
|
|
// 随机初始化,生成0到99之间的随机浮点数
|
|
|
|
|
|
|
|
for (int i = 0; i < SIZE; i++)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
A[i] = (float)(rand() % 100) / 100.0f;
|
|
|
|
|
|
|
|
B[i] = (float)(rand() % 100) / 100.0f;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
clock_t start = clock();
|
|
|
|
|
|
|
|
vector_add_optimized(A, B, C, SIZE);
|
|
|
|
|
|
|
|
clock_t end = clock();
|
|
|
|
|
|
|
|
// 计算并输出向量加法的时间
|
|
|
|
|
|
|
|
double time_spent = double(end - start) / CLOCKS_PER_SEC;
|
|
|
|
|
|
|
|
printf("使用NEON 优化向量加法:\n当SIZE取%d时,初始向量加法时间:%lf秒\n", SIZE,time_spent);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 释放动态分配的内存
|
|
|
|
|
|
|
|
free(A);free(B);free(C);
|
|
|
|
|
|
|
|
}
|