forked from pi7mcrg2k/opcomplex
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
62 lines
2.0 KiB
62 lines
2.0 KiB
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <time.h>
|
|
#include <string.h>
|
|
#include <arm_neon.h>
|
|
#include "render.h"
|
|
/* Wraps a row number onto 0..2; used below to pick a row of the 3-row rolling buffer. */
#define IDX(n) ((n) % 3)
|
|
|
|
/*
 * Blurs src into dst with a separable 3-tap kernel: kx is applied along each
 * row and ky down each column. Only interior pixels of dst are written.
 * Defined later in this file.
 */
void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE],
                                float kx[], float ky[]);
|
|
int main() {
|
|
float inputImage[MAT_SIZE][MAT_SIZE];
|
|
Render(inputImage);
|
|
float kernel[3][3] = {
|
|
{1.0f/16, 2.0f/16, 1.0f/16},
|
|
{2.0f/16, 4.0f/16, 2.0f/16},
|
|
{1.0f/16, 2.0f/16, 1.0f/16}
|
|
};
|
|
float kx[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f}; // 防止越界多定义一个
|
|
float ky[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f};
|
|
float outputImage[MAT_SIZE][MAT_SIZE] = {0};
|
|
clock_t start = clock();
|
|
applySeparableGaussianBlur(inputImage, outputImage, kx, ky);
|
|
clock_t end = clock();
|
|
printf("Time: %lf s\n", (double)(end-start) / CLOCKS_PER_SEC);
|
|
Print(outputImage);
|
|
}
|
|
|
|
/*
 * Horizontal 3-tap pass over one image row.
 *
 * For every interior column j (1 <= j <= MAT_SIZE-2) stores
 *     outRow[j] = srcRow[j-1]*kx[0] + srcRow[j]*kx[1] + srcRow[j+1]*kx[2].
 * Columns 0 and MAT_SIZE-1 of outRow are left untouched.
 * Only kx[0..2] are read.
 */
static void blurRowHorizontal(const float *srcRow, float *outRow, const float kx[])
{
    const float k0 = kx[0];
    const float k1 = kx[1];
    const float k2 = kx[2];
    int j = 1;

    /*
     * Vector body: run only while all four lanes are interior columns
     * (j+3 <= MAT_SIZE-2), so the right-neighbour load ends at column
     * MAT_SIZE-1 and the store never leaves the row. vld1q/vst1q are used
     * instead of dereferencing a casted float32x4_t pointer because the
     * addresses here are only float-aligned.
     */
    for (; j + 4 <= MAT_SIZE - 1; j += 4) {
        float32x4_t left  = vld1q_f32(&srcRow[j - 1]);
        float32x4_t mid   = vld1q_f32(&srcRow[j]);
        float32x4_t right = vld1q_f32(&srcRow[j + 1]);

        float32x4_t acc = vmulq_n_f32(left, k0);
        acc = vaddq_f32(acc, vmulq_n_f32(mid, k1));
        acc = vaddq_f32(acc, vmulq_n_f32(right, k2));
        vst1q_f32(&outRow[j], acc);
    }

    /* Scalar tail for the remaining (MAT_SIZE-2) % 4 interior columns. */
    for (; j < MAT_SIZE - 1; j++) {
        outRow[j] = srcRow[j - 1] * k0 + srcRow[j] * k1 + srcRow[j + 1] * k2;
    }
}

/*
 * Separable 3x3 blur of a MAT_SIZE x MAT_SIZE image.
 *
 * src: input image (not modified).
 * dst: output image; only interior pixels (rows and columns 1..MAT_SIZE-2)
 *      are written, the one-pixel border is left as the caller provided it.
 * kx:  horizontal taps, kx[0..2] are read.
 * ky:  vertical taps, ky[0..2] are read.
 *
 * Each source row is filtered horizontally exactly once into a rolling
 * buffer of three rows; the vertical taps are then applied across those
 * three buffered rows to produce one output row.
 */
void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE],
                                float kx[], float ky[]) {
    float buf[3][MAT_SIZE];

    /* No interior pixels exist; also avoids touching rows 1 and 2. */
    if (MAT_SIZE < 3) {
        return;
    }

    /* Prime the rolling buffer with the horizontal pass of rows 0 and 1. */
    blurRowHorizontal(src[0], buf[0], kx);
    blurRowHorizontal(src[1], buf[1], kx);

    for (int i = 1; i < MAT_SIZE - 1; i++) {
        /* Row i+1 overwrites the slot of row i-2, which is no longer needed. */
        blurRowHorizontal(src[i + 1], buf[IDX(i + 1)], kx);

        const float *above = buf[IDX(i - 1)];
        const float *here  = buf[IDX(i)];
        const float *below = buf[IDX(i + 1)];

        for (int j = 1; j < MAT_SIZE - 1; j++) {
            dst[i][j] = above[j] * ky[0] + here[j] * ky[1] + below[j] * ky[2];
        }
    }
}
|