#include<stdio.h> #include<time.h> #include<iostream> #include<ostream> #include<stdlib.h> #include<arm_neon.h> #define SIZE 1024 void vector_add_optimized(float* A, float* B, float* C, int size){ for(int i=0;i<size;i+=4){ float32x4_t vecA=vld1q_f32(&A[i]); float32x4_t vecB=vld1q_f32(&B[i]); float32x4_t vecC=vaddq_f32(vecA,vecB); vst1q_f32(&C[i],vecC); } } int main(){ float *A=(float*)malloc(SIZE*sizeof(float)); float *B=(float*)malloc(SIZE*sizeof(float)); float *C=(float*)malloc(SIZE*sizeof(float)); for(int i=0;i<SIZE;++i){ A[i]=rand()%100; B[i]=rand()%100; } clock_t start=clock(); vector_add_optimized(A,B,C,SIZE); clock_t end=clock(); std::cout<<"³õʼÏòÁ¿¼Ó·¨Ê±¼ä£º"<<double(end-start)/CLOCKS_PER_SEC<<"Ãë"<<std::endl; }