/*
 * blur_3.c - separable 3x3 Gaussian blur of a small fixed-size image,
 * with an ARM NEON fast path and a portable scalar fallback.
 */
#include <stdio.h>
#include <time.h>

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define BLUR_HAVE_NEON 1
#else
#define BLUR_HAVE_NEON 0
#endif

/* Row/column capacity of every image array in this file (float[5][5]). */
enum { BLUR_DIM = 5 };

/* Default 1-D kernels for the horizontal (kx) and vertical (ky) passes. */
float kx[3] = { 0.25f, 0.5f, 0.25f };
float ky[3] = { 0.25f, 0.5f, 0.25f };

#if BLUR_HAVE_NEON
/* Stores lanes 0..2 of v to out[0..2]; lane 3 is discarded. */
static inline void blur_store3(float *out, float32x4_t v)
{
    vst1_f32(out, vget_low_f32(v));
    vst1q_lane_f32(out + 2, v, 2);
}
#endif

/*
 * Horizontal 3-tap filter over one row:
 *   out[j] = in[j-1]*k[0] + in[j]*k[1] + in[j+1]*k[2]   for j in [1, w-2].
 * Columns 0 and w-1 of `out` are not written.
 */
static void blur_row_horizontal(const float *in, float *out, int w,
                                const float k[3])
{
#if BLUR_HAVE_NEON
    if (w == BLUR_DIM) {
        /* in[0..3] and in[1..4] are the only 4-lane windows inside a row. */
        float32x4_t left = vld1q_f32(in);
        float32x4_t mid = vld1q_f32(in + 1);
        /*
         * The right neighbours (in[2..4]) are derived from `mid` instead of
         * loading in[2..5], which would read one float past the row.
         * The padded lane 3 only feeds a result lane that is never stored.
         */
        float32x4_t right = vextq_f32(mid, vdupq_n_f32(0.0f), 1);

        float32x4_t acc = vmulq_n_f32(left, k[0]);
        acc = vmlaq_n_f32(acc, mid, k[1]);
        acc = vmlaq_n_f32(acc, right, k[2]);

        blur_store3(out + 1, acc);
        return;
    }
#endif
    for (int j = 1; j < w - 1; j++) {
        out[j] = in[j - 1] * k[0] + in[j] * k[1] + in[j + 1] * k[2];
    }
}

/*
 * Vertical 3-tap filter producing one output row from three input rows:
 *   out[j] = top[j]*k[0] + mid[j]*k[1] + bot[j]*k[2]   for j in [1, w-2].
 * Columns 0 and w-1 of `out` are not written.
 */
static void blur_row_vertical(const float *top, const float *mid,
                              const float *bot, float *out, int w,
                              const float k[3])
{
#if BLUR_HAVE_NEON
    if (w == BLUR_DIM) {
        /* Columns 1..4 of each row: every load stays inside its own row. */
        float32x4_t acc = vmulq_n_f32(vld1q_f32(top + 1), k[0]);
        acc = vmlaq_n_f32(acc, vld1q_f32(mid + 1), k[1]);
        acc = vmlaq_n_f32(acc, vld1q_f32(bot + 1), k[2]);

        blur_store3(out + 1, acc);
        return;
    }
#endif
    for (int j = 1; j < w - 1; j++) {
        out[j] = top[j] * k[0] + mid[j] * k[1] + bot[j] * k[2];
    }
}

/*
 * Applies a separable 3x3 blur to the top-left h x w region of `src` and
 * writes the result to the same region of `dst`.
 *
 * The horizontal pass filters every row with `kx` into a scratch buffer;
 * the vertical pass then filters that buffer with `ky`. Only interior
 * pixels (rows 1..h-2, columns 1..w-2) receive a filtered value; the
 * one-pixel border of the h x w region is set to 0. Elements of `dst`
 * outside the h x w region are left untouched.
 *
 * Parameters:
 *   src  input image, 5x5 storage
 *   dst  output image, 5x5 storage
 *   h,w  logical height and width, each expected in [1, 5]
 *   kx   3 horizontal kernel taps
 *   ky   3 vertical kernel taps
 *
 * If any pointer is NULL or h/w fall outside [1, 5], the function returns
 * without writing to `dst`.
 *
 * The NEON path is taken when the file is built for a NEON-capable ARM
 * target and w == 5; otherwise the scalar loops are used.
 */
void applySeparableGaussianBlur_NEON(float src[5][5], float dst[5][5], int h, int w, float kx[3], float ky[3]) {
    if (src == NULL || dst == NULL || kx == NULL || ky == NULL) {
        return;
    }
    /* Larger sizes would overrun the 5x5 storage; non-positive ones would
     * make the border loops index element -1. */
    if (h <= 0 || w <= 0 || h > BLUR_DIM || w > BLUR_DIM) {
        return;
    }

    /* Horizontally filtered image; border columns stay 0. */
    float buf[BLUR_DIM][BLUR_DIM] = { { 0 } };

    /* Every row is filtered, including rows 0 and h-1, because the vertical
     * pass reads them as neighbours of rows 1 and h-2. */
    for (int i = 0; i < h; i++) {
        blur_row_horizontal(src[i], buf[i], w, kx);
    }

    for (int i = 1; i < h - 1; i++) {
        blur_row_vertical(buf[i - 1], buf[i], buf[i + 1], dst[i], w, ky);
    }

    /* Zero the one-pixel frame of the processed region. */
    for (int i = 0; i < h; i++) {
        dst[i][0] = 0;
        dst[i][w - 1] = 0;
    }
    for (int j = 0; j < w; j++) {
        dst[0][j] = 0;
        dst[h - 1][j] = 0;
    }
}

/* Demo: blurs a 5x5 ramp image, prints the result and the elapsed CPU time. */
int main(void) {
    float src[5][5] __attribute__((aligned(16))) = {
        {1, 2, 3, 4, 5},
        {6, 7, 8, 9, 10},
        {11, 12, 13, 14, 15},
        {16, 17, 18, 19, 20},
        {21, 22, 23, 24, 25}
    };

    float dst[5][5] = { { 0 } };

    clock_t start = clock();
    applySeparableGaussianBlur_NEON(src, dst, 5, 5, kx, ky);
    clock_t end = clock();

    printf("模糊后的图像矩阵:\n");
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", dst[i][j]);
        }
        printf("\n");
    }

    double time_taken = (double)(end - start) / CLOCKS_PER_SEC;
    printf("运行时间: %e 秒\n", time_taken);

    return 0;
}