/*
 * GaussianBlur_Step3_NEON.c
 *
 * Separable 3-tap blur over a small fixed-size float image. The inner loops
 * use Arm NEON intrinsics when the target provides them and fall back to
 * plain scalar code otherwise, so the file builds on any host.
 */
#include <stdio.h>
#include <time.h>

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define GAUSS_USE_NEON 1
#else
#define GAUSS_USE_NEON 0
#endif

#define H 5
#define W 5

/**
 * Blur the interior of `src` with a separable 3-tap kernel and store the
 * result in `dst`.
 *
 * A horizontal pass (taps `kx`) writes an intermediate buffer, then a
 * vertical pass (taps `ky`) writes `dst`. Only interior pixels
 * (rows 1..h-2, columns 1..w-2) are written; the border of `dst` is left
 * exactly as the caller provided it.
 *
 * @param src  input image, H x W storage
 * @param dst  output image, H x W storage; interior is overwritten
 * @param h    number of rows to process (clamped to H)
 * @param w    number of columns to process (clamped to W)
 * @param kx   horizontal taps: left, centre, right
 * @param ky   vertical taps: top, centre, bottom
 */
void applySeparableGaussianBlur(float src[H][W], float dst[H][W], int h, int w, float kx[3], float ky[3]) {
    // Result of the horizontal pass. Border columns stay zero and are never
    // read by the vertical pass.
    float buf[H][W] = {0};

    // The arrays hold H x W elements whatever the caller passes, so never
    // index beyond that.
    if (h > H) {
        h = H;
    }
    if (w > W) {
        w = W;
    }

#if GAUSS_USE_NEON
    // Broadcast each tap into its own register. (Duplicating only element 0
    // of the kernel would apply the same weight to all three taps.)
    const float32x4_t kx_left = vdupq_n_f32(kx[0]);
    const float32x4_t kx_mid = vdupq_n_f32(kx[1]);
    const float32x4_t kx_right = vdupq_n_f32(kx[2]);
    const float32x4_t ky_top = vdupq_n_f32(ky[0]);
    const float32x4_t ky_mid = vdupq_n_f32(ky[1]);
    const float32x4_t ky_bottom = vdupq_n_f32(ky[2]);
#endif

    // Horizontal pass: every row, interior columns only.
    for (int i = 0; i < h; i++) {
        int j = 1;
#if GAUSS_USE_NEON
        // The right-neighbour load touches columns j+1 .. j+4, so a full
        // vector step is only safe while j + 4 <= w - 1. That bound also
        // keeps the 4-wide store inside the interior (j .. j+3 <= w - 2).
        for (; j + 4 <= w - 1; j += 4) {
            float32x4_t src_left = vld1q_f32(&src[i][j - 1]);
            float32x4_t src_mid = vld1q_f32(&src[i][j]);
            float32x4_t src_right = vld1q_f32(&src[i][j + 1]);

            float32x4_t result = vmulq_f32(src_left, kx_left);
            result = vmlaq_f32(result, src_mid, kx_mid);
            result = vmlaq_f32(result, src_right, kx_right);

            vst1q_f32(&buf[i][j], result);
        }
#endif
        // Scalar tail: remaining interior columns (all of them when the
        // interior is narrower than one vector, or NEON is unavailable).
        for (; j < w - 1; j++) {
            buf[i][j] = kx[0] * src[i][j - 1] + kx[1] * src[i][j] + kx[2] * src[i][j + 1];
        }
    }

    // Vertical pass: interior rows and interior columns only.
    for (int i = 1; i < h - 1; i++) {
        int j = 1;
#if GAUSS_USE_NEON
        // Same bound as above so the store never reaches border column w-1.
        for (; j + 4 <= w - 1; j += 4) {
            float32x4_t buf_top = vld1q_f32(&buf[i - 1][j]);
            float32x4_t buf_mid = vld1q_f32(&buf[i][j]);
            float32x4_t buf_bottom = vld1q_f32(&buf[i + 1][j]);

            float32x4_t result = vmulq_f32(buf_top, ky_top);
            result = vmlaq_f32(result, buf_mid, ky_mid);
            result = vmlaq_f32(result, buf_bottom, ky_bottom);

            vst1q_f32(&dst[i][j], result);
        }
#endif
        for (; j < w - 1; j++) {
            dst[i][j] = ky[0] * buf[i - 1][j] + ky[1] * buf[i][j] + ky[2] * buf[i + 1][j];
        }
    }
}

/* Demo driver: blurs a 5x5 ramp image, prints the result and the CPU time. */
int main(void) {
    float src[H][W] = {
        {1, 2, 3, 4, 5},
        {6, 7, 8, 9, 10},
        {11, 12, 13, 14, 15},
        {16, 17, 18, 19, 20},
        {21, 22, 23, 24, 25}
    };
    // The 3x3 Gaussian [1 2 1; 2 4 2; 1 2 1] / 16 factors into [1 2 1] / 4
    // per axis, so each 1-D kernel sums to 1 and brightness is preserved.
    float kx[3] = {1.0f / 4.0f, 2.0f / 4.0f, 1.0f / 4.0f};
    float ky[3] = {1.0f / 4.0f, 2.0f / 4.0f, 1.0f / 4.0f};
    float dst[H][W] = {0};

    clock_t start = clock();
    applySeparableGaussianBlur(src, dst, H, W, kx, ky);
    clock_t end = clock();

    printf("Output matrix:\n");
    for (int i = 0; i < H; i++) {
        for (int j = 0; j < W; j++) {
            printf("%.2f ", dst[i][j]);
        }
        printf("\n");
    }

    printf("Execution time: %lf seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
    return 0;
}