forked from pi7mcrg2k/opcomplex
				
			
			You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							62 lines
						
					
					
						
							2.0 KiB
						
					
					
				
			
		
		
	
	
							62 lines
						
					
					
						
							2.0 KiB
						
					
					
				| #include <stdio.h>
 | |
| #include <stdlib.h>
 | |
| #include <time.h>
 | |
| #include <string.h>
 | |
| #include <arm_neon.h>
 | |
| #include "render.h"
 | |
| #define IDX(n) ((n) % 3)
 | |
| 
 | |
| void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], 
 | |
| 	 float kx[], float ky[]);
 | |
| int main() {
 | |
| 	float inputImage[MAT_SIZE][MAT_SIZE];
 | |
| 	Render(inputImage);
 | |
| 	float kernel[3][3] = {
 | |
| 		{1.0f/16, 2.0f/16, 1.0f/16},
 | |
| 		{2.0f/16, 4.0f/16, 2.0f/16},
 | |
| 		{1.0f/16, 2.0f/16, 1.0f/16}
 | |
| 	};
 | |
| 	float kx[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f}; // 防止越界多定义一个
 | |
| 	float ky[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f};
 | |
| 	float outputImage[MAT_SIZE][MAT_SIZE] = {0};
 | |
| 	clock_t start = clock();
 | |
| 	applySeparableGaussianBlur(inputImage, outputImage, kx, ky);
 | |
| 	clock_t end   = clock();
 | |
| 	printf("Time: %lf s\n", (double)(end-start) / CLOCKS_PER_SEC);
 | |
| 	Print(outputImage);
 | |
| }
 | |
| 
 | |
/*
 * Horizontal 3-tap pass over one image row.
 *
 * For every interior column j in [1, MAT_SIZE-2]:
 *     out[j] = in[j-1]*kx[0] + in[j]*kx[1] + in[j+1]*kx[2]
 * Columns 0 and MAT_SIZE-1 of out are left untouched. Only kx[0..2] are read.
 */
static void blurRowHorizontal(const float *in, float *out, const float *kx) {
	int j = 1;

	/*
	 * Vector body, four columns per step. The widest access is the "right"
	 * load, which touches in[j+1 .. j+4], so a step is taken only while
	 * j+4 is still a valid column. vld1q_f32/vst1q_f32 are used instead of
	 * dereferencing a casted float32x4_t pointer because &in[j] is only
	 * guaranteed to be float-aligned.
	 */
	for (; j + 4 <= MAT_SIZE - 1; j += 4) {
		float32x4_t left  = vld1q_f32(&in[j - 1]);
		float32x4_t mid   = vld1q_f32(&in[j]);
		float32x4_t right = vld1q_f32(&in[j + 1]);

		float32x4_t acc = vmulq_n_f32(left, kx[0]);
		acc = vmlaq_n_f32(acc, mid, kx[1]);
		acc = vmlaq_n_f32(acc, right, kx[2]);
		vst1q_f32(&out[j], acc);
	}

	/*
	 * Scalar tail for the columns left over when (MAT_SIZE - 2) is not a
	 * multiple of four; this keeps every access inside the row.
	 */
	for (; j < MAT_SIZE - 1; j++) {
		out[j] = in[j - 1] * kx[0] + in[j] * kx[1] + in[j + 1] * kx[2];
	}
}

/*
 * Separable 3x3 blur of a MAT_SIZE x MAT_SIZE image.
 *
 * src: input image (read only).
 * dst: output image; only interior pixels (rows and columns 1..MAT_SIZE-2)
 *      are written, the one-pixel border keeps whatever the caller stored.
 * kx:  horizontal taps, elements 0..2 are used.
 * ky:  vertical taps, elements 0..2 are used.
 *
 * Each source row is convolved horizontally once into a three-row ring
 * buffer (indexed with IDX); the vertical pass then combines the three
 * buffered rows surrounding the current output row.
 */
void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE],
		float kx[], float ky[]) {
	float buf[3][MAT_SIZE];
	int i, j;

	/* Prime the ring buffer with the horizontal pass of rows 0 and 1. */
	for (i = 0; i < 2; i++) {
		blurRowHorizontal(src[i], buf[i], kx);
	}

	for (i = 1; i < MAT_SIZE - 1; i++) {
		/*
		 * Row i+1 overwrites the slot that held row i-2, which is no
		 * longer needed for any later output row.
		 */
		blurRowHorizontal(src[i + 1], buf[IDX(i + 1)], kx);

		/* Vertical pass over rows i-1, i and i+1 of the buffer. */
		for (j = 1; j < MAT_SIZE - 1; j++) {
			dst[i][j] = buf[IDX(i - 1)][j] * ky[0]
			          + buf[IDX(i)][j]     * ky[1]
			          + buf[IDX(i + 1)][j] * ky[2];
		}
	}
}
 |