diff --git a/GaussianBlur_Step3_NEON.c b/GaussianBlur_Step3_NEON.c
new file mode 100644
index 0000000..8f7cd2a
--- /dev/null
+++ b/GaussianBlur_Step3_NEON.c
@@ -0,0 +1,73 @@
+
+#include <stdio.h>
+#include <time.h>
+#include <arm_neon.h>
+
+#define H 5  // row count of the fixed-size image arrays (src, dst, scratch buffer)
+#define W 5  // column count of the same arrays
+
+// Separable 3-tap blur of the top-left h x w region (requires h <= H and w <= W):
+// a horizontal pass with kx fills a scratch buffer, then a vertical pass with ky fills
+// dst. Only dst[1..h-2][1..w-2] is written; border entries keep the caller's values.
+void applySeparableGaussianBlur(float src[H][W], float dst[H][W], int h, int w, float kx[3], float ky[3]) {
+    float buf[H][W] = {0};
+    // One broadcast register per tap; a single vld1q_dup_f32(kx) replicates only kx[0].
+    const float32x4_t kx0 = vdupq_n_f32(kx[0]), kx1 = vdupq_n_f32(kx[1]), kx2 = vdupq_n_f32(kx[2]);
+    const float32x4_t ky0 = vdupq_n_f32(ky[0]), ky1 = vdupq_n_f32(ky[1]), ky2 = vdupq_n_f32(ky[2]);
+    // Horizontal pass over every row, since the vertical pass reads rows 0 and h-1 too.
+    for (int i = 0; i < h; i++) {
+        int j = 1;
+        // Vector body needs j + 4 < w: the right-neighbour load reads columns j+1..j+4.
+        for (; j + 4 < w; j += 4) {
+            float32x4_t acc = vmulq_f32(vld1q_f32(&src[i][j - 1]), kx0);
+            acc = vmlaq_f32(acc, vld1q_f32(&src[i][j]), kx1);
+            acc = vmlaq_f32(acc, vld1q_f32(&src[i][j + 1]), kx2);
+            vst1q_f32(&buf[i][j], acc);
+        }
+        for (; j < w - 1; j++) {  // scalar tail for the leftover interior columns
+            buf[i][j] = kx[0] * src[i][j - 1] + kx[1] * src[i][j] + kx[2] * src[i][j + 1];
+        }
+    }
+
+    // Vertical pass over interior rows; the same bound keeps 4-wide stores off column w-1.
+    for (int i = 1; i < h - 1; i++) {
+        int j = 1;
+        for (; j + 4 < w; j += 4) {
+            float32x4_t acc = vmulq_f32(vld1q_f32(&buf[i - 1][j]), ky0);
+            acc = vmlaq_f32(acc, vld1q_f32(&buf[i][j]), ky1);
+            acc = vmlaq_f32(acc, vld1q_f32(&buf[i + 1][j]), ky2);
+            vst1q_f32(&dst[i][j], acc);
+        }
+        for (; j < w - 1; j++) {  // scalar tail
+            dst[i][j] = ky[0] * buf[i - 1][j] + ky[1] * buf[i][j] + ky[2] * buf[i + 1][j];
+        }
+    }
+}
+
+// Demo driver: blurs a fixed H x W ramp image, prints the resulting dst matrix, and
+// reports the CPU time measured with clock() around the single blur call.
+int main(void) {
+    float src[H][W] = {
+        {1, 2, 3, 4, 5},
+        {6, 7, 8, 9, 10},
+        {11, 12, 13, 14, 15},
+        {16, 17, 18, 19, 20},
+        {21, 22, 23, 24, 25}
+    };
+    // 1D factors of the 3x3 [1 2 1; 2 4 2; 1 2 1] / 16 kernel: each axis gets [1 2 1] / 4,
+    // so the two passes together sum to 1 and overall brightness is preserved.
+    float kx[3] = {0.25f, 0.5f, 0.25f};
+    float ky[3] = {0.25f, 0.5f, 0.25f};
+    float dst[H][W] = {0};
+    clock_t start = clock();
+    applySeparableGaussianBlur(src, dst, H, W, kx, ky);
+    clock_t end = clock();
+    printf("Output matrix:\n");
+    for (int i = 0; i < H; i++) {
+        for (int j = 0; j < W; j++) {
+            printf("%.2f ", dst[i][j]);
+        }
+        printf("\n");
+    }
+    printf("Execution time: %lf seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
+    return 0;
+}