#include<stdio.h>
#include<time.h>
#include<iostream>
#include<ostream>
#include<stdlib.h>
#include<arm_neon.h>
#define SIZE 1024
 /*
  * Element-wise float addition: C[i] = A[i] + B[i] for every i in [0, size).
  *
  * A, B : input arrays, each holding at least `size` floats
  * C    : output array with room for at least `size` floats
  * size : number of elements to process; zero or negative does nothing
  *
  * Full groups of four floats go through 128-bit NEON loads, one vector add
  * and one vector store. Any remaining elements (size % 4) are added one at a
  * time, so no access ever goes past index size-1.
  */
 void vector_add_optimized(float* A, float* B, float* C, int size){
	int i=0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
	// Vector path: only taken while a complete group of 4 elements is left.
	// Written as `size-i>=4` so the bound check cannot overflow for large size.
	for(;size-i>=4;i+=4){
		float32x4_t vecA=vld1q_f32(&A[i]);
		float32x4_t vecB=vld1q_f32(&B[i]);
		float32x4_t vecC=vaddq_f32(vecA,vecB);
		vst1q_f32(&C[i],vecC);
	}
#endif
	// Scalar path: handles the tail left over by the vector loop, and the
	// whole range when the NEON section above is compiled out.
	for(;i<size;++i){
		C[i]=A[i]+B[i];
	}
}
/*
 * Timing driver for vector_add_optimized.
 *
 * Allocates three float buffers of SIZE elements, fills the two inputs with
 * pseudo-random integers in [0, 99], times a single call with clock(), prints
 * the elapsed processor time in seconds, and releases the buffers.
 *
 * Returns 0 on success, EXIT_FAILURE if any allocation fails.
 */
int main(){
	float *A=(float*)malloc(SIZE*sizeof(float));
	float *B=(float*)malloc(SIZE*sizeof(float));
	float *C=(float*)malloc(SIZE*sizeof(float));
	// malloc returns a null pointer on failure; bail out before touching the buffers.
	if(!A||!B||!C){
		std::cerr<<"memory allocation failed"<<std::endl;
		// free() accepts a null pointer, so all three can be released unconditionally.
		free(A);
		free(B);
		free(C);
		return EXIT_FAILURE;
	}
	for(int i=0;i<SIZE;++i){
		A[i]=rand()%100;
		B[i]=rand()%100;
	}
	// clock() reports processor time; dividing by CLOCKS_PER_SEC converts to seconds.
	clock_t start=clock();
	vector_add_optimized(A,B,C,SIZE);
	clock_t end=clock();
	// The label strings are non-ASCII text and are kept byte-for-byte.
	std::cout<<"³õʼÏòÁ¿¼Ó·¨Ê±¼ä£º"<<double(end-start)/CLOCKS_PER_SEC<<"Ãë"<<std::endl; 
	free(A);
	free(B);
	free(C);
	return 0;
}