forked from pi7mcrg2k/opcomplex
parent
b4bbaa9e9b
commit
ebcc723cf3
@ -0,0 +1,76 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <string.h>
|
||||
#include <arm_neon.h>
|
||||
/* Maps a row number n onto one of three slots (n modulo 3); the result is
 * in 0..2 for non-negative n. */
#define IDX(n) ((n) % 3)
|
||||
|
||||
/* Separable 3-tap blur of the h-by-w region of src into dst, using kx along
 * rows and ky along columns.  Defined further down in this file. */
void applySeparableGaussianBlur(float src[][100], float dst[][100],
                                int h, int w, float kx[], float ky[]);

/* Writes the h-by-w region of a to stdout.  Defined further down in this
 * file. */
void print(float a[][100], int h, int w);
|
||||
/*
 * Demo driver: blurs a 5x5 ramp image (values 1..25) with the separable
 * 3-tap kernel, prints the CPU time the blur call took as measured by
 * clock(), then prints the 5x5 result.
 */
int main(void)
{
    enum { IMAGE_H = 5, IMAGE_W = 5 };

    /* Rows are 100 floats wide because that is the row stride the blur and
     * print routines take; only the first IMAGE_W columns carry data, the
     * rest are zero-initialised. */
    float inputImage[IMAGE_H][100] = {
        { 1,  2,  3,  4,  5},
        { 6,  7,  8,  9, 10},
        {11, 12, 13, 14, 15},
        {16, 17, 18, 19, 20},
        {21, 22, 23, 24, 25}
    };

    /* 1-D taps (1/4, 1/2, 1/4).  Their outer product is the 3x3 kernel
     * {1,2,1; 2,4,2; 1,2,1}/16.  One extra zero tap pads each array to four
     * floats, so code that loads the taps as a 4-lane vector never reads
     * past the end of the array. */
    float kx[4] = {0.25f, 0.5f, 0.25f, 0.0f};
    float ky[4] = {0.25f, 0.5f, 0.25f, 0.0f};

    float outputImage[IMAGE_H][100] = {0};

    clock_t start = clock();
    applySeparableGaussianBlur(inputImage, outputImage, IMAGE_H, IMAGE_W, kx, ky);
    clock_t end = clock();

    printf("Time: %lf s\n", (double)(end - start) / CLOCKS_PER_SEC);
    print(outputImage, IMAGE_H, IMAGE_W);
    return 0;
}
|
||||
|
||||
/*
 * Horizontal 3-tap pass over one image row.
 *
 * For every interior column j in [1, w-2]:
 *     out_row[j] = in_row[j-1]*kx[0] + in_row[j]*kx[1] + in_row[j+1]*kx[2]
 * Columns 0 and w-1 of out_row are not written.
 *
 * in_row  - source row, at least w floats
 * out_row - destination row, at least w floats
 * w       - number of valid columns in the row
 * kx      - three horizontal taps
 */
static void blurRowHorizontal(const float *in_row, float *out_row, int w,
                              const float kx[])
{
    int j = 1;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Vector path: four output columns per iteration.  The widest load
     * starts at j + 1 and therefore touches column j + 4, so a chunk is
     * taken only while j + 4 <= w - 1.  That keeps every load and store
     * inside the w valid columns. */
    for (; j + 4 <= w - 1; j += 4) {
        float32x4_t left  = vld1q_f32(&in_row[j - 1]);
        float32x4_t mid   = vld1q_f32(&in_row[j]);
        float32x4_t right = vld1q_f32(&in_row[j + 1]);

        float32x4_t acc = vmulq_n_f32(left, kx[0]);
        acc = vaddq_f32(acc, vmulq_n_f32(mid, kx[1]));
        acc = vaddq_f32(acc, vmulq_n_f32(right, kx[2]));

        vst1q_f32(&out_row[j], acc);
    }
#endif

    /* Scalar path: finishes the columns the vector loop could not take as a
     * full group of four (all interior columns when NEON is unavailable). */
    for (; j < w - 1; j++) {
        out_row[j] = in_row[j - 1] * kx[0]
                   + in_row[j]     * kx[1]
                   + in_row[j + 1] * kx[2];
    }
}

/*
 * Separable 3x3 blur: a horizontal 3-tap pass with kx followed by a
 * vertical 3-tap pass with ky.
 *
 * src, dst - images stored with a row stride of 100 floats
 * h, w     - height and width of the region to process
 * kx, ky   - horizontal / vertical taps; only elements 0..2 are read
 *
 * Only interior pixels (rows 1..h-2, columns 1..w-2) of dst are written;
 * the one-pixel border is left exactly as the caller provided it.  When
 * there is no interior (h < 3 or w < 3), or w exceeds the 100-float row
 * stride, the function returns without touching dst.
 */
void applySeparableGaussianBlur(float src[][100], float dst[][100],
                                int h, int w, float kx[], float ky[])
{
    enum { ROW_STRIDE = 100 };

    if (h < 3 || w < 3 || w > ROW_STRIDE) {
        return;
    }

    /* Three horizontally blurred rows are alive at any time: the rows
     * above, at, and below the output row.  No initialisation is needed
     * because every element that is read has been written first. */
    float ring[3][ROW_STRIDE];
    float *above  = ring[0];
    float *centre = ring[1];
    float *below  = ring[2];

    /* Prime the window with the first two rows. */
    blurRowHorizontal(src[0], above, w, kx);
    blurRowHorizontal(src[1], centre, w, kx);

    for (int i = 1; i < h - 1; i++) {
        /* Bring in the next row, then combine the three rows vertically. */
        blurRowHorizontal(src[i + 1], below, w, kx);

        for (int j = 1; j < w - 1; j++) {
            dst[i][j] = above[j] * ky[0] + centre[j] * ky[1] + below[j] * ky[2];
        }

        /* Slide the window down one row; the oldest buffer is reused for
         * the row loaded on the next iteration. */
        float *recycled = above;
        above  = centre;
        centre = below;
        below  = recycled;
    }
}
|
||||
|
||||
/*
 * Dumps the top-left h-by-w region of a to stdout: one text line per image
 * row, each value formatted as "%5.1f" followed by a space.
 */
void print(float a[][100], int h, int w)
{
    int row = 0;

    while (row < h) {
        const float *cells = a[row];
        int col = 0;

        while (col < w) {
            printf("%5.1f ", cells[col]);
            col++;
        }
        printf("\n");
        row++;
    }
}
|
Loading…
Reference in new issue