You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

110 lines
3.2 KiB

#include <stdio.h>
#include <time.h>
#include <arm_neon.h>
/* 3-tap smoothing weights (they sum to 1) passed to the blur as its row kernel. */
float kx[3] = {
    [0] = 0.25f,
    [1] = 0.50f,
    [2] = 0.25f,
};

/* 3-tap smoothing weights (they sum to 1) passed to the blur as its column kernel. */
float ky[3] = {
    [0] = 0.25f,
    [1] = 0.50f,
    [2] = 0.25f,
};
/*
 * Computes out[t] = a[t]*k0 + b[t]*k1 + c[t]*k2 for every t in [0, n).
 *
 * When the compiler advertises NEON, complete groups of four elements are
 * processed with vector intrinsics; the remaining (fewer than four) elements,
 * or all of them on a non-NEON build, go through the scalar loop. No element
 * outside [0, n) is ever read or written, so callers can point the arguments
 * at the middle of a row without worrying about over-reads.
 *
 * `out` must not overlap `a`, `b` or `c`.
 */
static void blur_weighted_sum3(const float *a, const float *b, const float *c,
                               float k0, float k1, float k2,
                               float *out, int n)
{
    int t = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Only full 4-lane blocks: every load and store stays inside [0, n). */
    for (; t + 4 <= n; t += 4) {
        float32x4_t acc = vmulq_n_f32(vld1q_f32(a + t), k0);
        acc = vmlaq_n_f32(acc, vld1q_f32(b + t), k1);
        acc = vmlaq_n_f32(acc, vld1q_f32(c + t), k2);
        vst1q_f32(out + t, acc);
    }
#endif

    /* Scalar tail (and the whole range when NEON is unavailable). */
    for (; t < n; t++) {
        out[t] = a[t] * k0 + b[t] * k1 + c[t] * k2;
    }
}

/*
 * Applies a separable 3x3 blur to the top-left h-by-w region of `src` and
 * writes the result to the same region of `dst`.
 *
 * The filter runs as two 1-D passes: a horizontal pass with `kx` into a
 * zero-initialised scratch image, then a vertical pass with `ky` into `dst`.
 * The one-pixel border of the h-by-w region is set to 0 because the 3-tap
 * window does not fit there.
 *
 * Parameters:
 *   src  input image; only rows [0, h) and columns [0, w) are read.
 *   dst  output image; only rows [0, h) and columns [0, w) are written.
 *   h,w  size of the processed region, each in 1..5. Any other value makes
 *        the call a no-op (the scratch image is a fixed 5x5 array).
 *   kx   3 horizontal weights (left, centre, right).
 *   ky   3 vertical weights (top, centre, bottom).
 */
void applySeparableGaussianBlur_NEON(float src[5][5], float dst[5][5], int h, int w, float kx[3], float ky[3]) {
    enum { BLUR_DIM = 5 };

    if (h <= 0 || w <= 0 || h > BLUR_DIM || w > BLUR_DIM) {
        return;
    }

    /* Scratch image for the horizontal pass; its border columns stay 0. */
    float buf[BLUR_DIM][BLUR_DIM] = { { 0.0f } };

    /*
     * Read the taps as scalars: the kernels hold exactly three floats, so a
     * 4-lane vector load from them would read past the end of the array.
     */
    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];

    /* Horizontal pass: interior columns 1..w-2 of every row. */
    for (int i = 0; i < h; i++) {
        blur_weighted_sum3(&src[i][0], &src[i][1], &src[i][2],
                           kx0, kx1, kx2, &buf[i][1], w - 2);
    }

    /*
     * Vertical pass: interior rows 1..h-2, all w columns. The border columns
     * of buf are 0, so they simply produce 0 here; covering the whole row is
     * what lets a 4-lane block fit inside a 5-wide image.
     */
    for (int i = 1; i < h - 1; i++) {
        blur_weighted_sum3(&buf[i - 1][0], &buf[i][0], &buf[i + 1][0],
                           ky0, ky1, ky2, &dst[i][0], w);
    }

    /* Zero the one-pixel frame of the processed region. */
    for (int i = 0; i < h; i++) {
        dst[i][0] = 0;
        dst[i][w - 1] = 0;
    }
    for (int j = 0; j < w; j++) {
        dst[0][j] = 0;
        dst[h - 1][j] = 0;
    }
}
/*
 * Demo driver: blurs a fixed 5x5 ramp image with the file-scope kx/ky
 * kernels, prints the resulting matrix, and reports the CPU time consumed
 * by the single blur call as measured with clock().
 */
int main(void) {
    float src[5][5] __attribute__((aligned(16))) = {
        {1, 2, 3, 4, 5},
        {6, 7, 8, 9, 10},
        {11, 12, 13, 14, 15},
        {16, 17, 18, 19, 20},
        {21, 22, 23, 24, 25}
    };
    float dst[5][5] = { { 0 } };

    clock_t start = clock();
    applySeparableGaussianBlur_NEON(src, dst, 5, 5, kx, ky);
    clock_t end = clock();

    /*
     * The original labels were stored in a legacy encoding and had decayed
     * into replacement-character residue; plain ASCII labels print correctly
     * regardless of source or terminal encoding.
     */
    printf("Blurred image:\n");
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", dst[i][j]);
        }
        printf("\n");
    }

    double time_taken = (double)(end - start) / CLOCKS_PER_SEC;
    printf("Elapsed time: %e s\n", time_taken);
    return 0;
}