#include <stdio.h> #include <stdlib.h> #include <time.h> #include <string.h> #include <arm_neon.h> #include "render.h" #define IDX(n) ((n) % 3) void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], float kx[], float ky[]); int main() { float inputImage[MAT_SIZE][MAT_SIZE]; Render(inputImage); float kernel[3][3] = { {1.0f/16, 2.0f/16, 1.0f/16}, {2.0f/16, 4.0f/16, 2.0f/16}, {1.0f/16, 2.0f/16, 1.0f/16} }; float kx[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f}; // 防止越界多定义一个 float ky[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f}; float outputImage[MAT_SIZE][MAT_SIZE] = {0}; clock_t start = clock(); applySeparableGaussianBlur(inputImage, outputImage, kx, ky); clock_t end = clock(); printf("Time: %lf s\n", (double)(end-start) / CLOCKS_PER_SEC); Print(outputImage); } void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], float kx[], float ky[]) { int i, j; float buf[3][MAT_SIZE]; float32x4_t kx_vec = vld1q_f32(kx); float32x4_t ky_vec = vld1q_f32(ky); // 计算前两行的行内卷积 float32x4_t* left, *mid, *right, *result; for(i=0; i<2; i++) for(j=1; j<MAT_SIZE-1; j+=4) { left = (float32x4_t*)&src[i][j-1]; mid = (float32x4_t*)&src[i][j]; right = (float32x4_t*)&src[i][j+1]; result = (float32x4_t*)&buf[i][j]; *result = vmulq_lane_f32(*left, vget_low_f32(kx_vec), 0); *result += vmulq_lane_f32(*mid, vget_low_f32(kx_vec), 1); *result += vmulq_lane_f32(*right, vget_high_f32(kx_vec), 0); } // 利用buf进行累计 for(i=1; i<MAT_SIZE-1; i++) { for(j=1; j<MAT_SIZE-1; j+=4) { left = (float32x4_t*)&src[i+1][j-1]; mid = (float32x4_t*)&src[i+1][j]; right = (float32x4_t*)&src[i+1][j+1]; result = (float32x4_t*)&buf[IDX(i+1)][j]; *result = vmulq_lane_f32(*left, vget_low_f32(kx_vec), 0); *result += vmulq_lane_f32(*mid, vget_low_f32(kx_vec), 1); *result += vmulq_lane_f32(*right, vget_high_f32(kx_vec), 0); } for(j=1; j<MAT_SIZE-1; j++) dst[i][j] = buf[IDX(i-1)][j]*ky[0]+buf[IDX(i)][j]*ky[1]+buf[IDX(i+1)][j]*ky[2]; } }