ADD file via upload

3 months ago · 02d7ffebe2
parent 7ce6b2ad6b
commit 02d7ffebe2
1 changed files with 110 additions and 0 deletions
--- a/step3.c
+++ b/step3.c
@@ -0,0 +1,110 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <arm_neon.h>
+/*
+ * Scalar reference implementation of a separable 3-tap blur.
+ *
+ * A horizontal pass with kernel kx writes into a scratch image, then a
+ * vertical pass with kernel ky writes the final value into dst. Taps that
+ * fall outside the h x w region are skipped (no padding, no renormalisation),
+ * so border pixels are computed from fewer taps.
+ *
+ * src, dst : images stored with a row stride of 100 floats
+ * h, w     : number of rows / columns to process
+ * kx, ky   : 3-element kernels; index 0 is the tap at offset -1,
+ *            index 1 the centre tap, index 2 the tap at offset +1
+ *
+ * If h or w exceeds the capacity of the fixed-size scratch image the call
+ * returns without touching dst, instead of overrunning the stack buffer.
+ */
+void applyGaussianBlur(float src[][100], float dst[][100], int h, int w, float kx[3], float ky[3]) {
+    float temp[100][100];
+    const int max_rows = (int)(sizeof temp / sizeof temp[0]);
+    const int max_cols = (int)(sizeof temp[0] / sizeof temp[0][0]);
+
+    /* The scratch image (and the row stride of src/dst) bounds what can be processed. */
+    if (h > max_rows || w > max_cols) {
+        return;
+    }
+
+    /* Pass 1: horizontal taps, src -> temp. */
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            float sum = 0.0f;
+            for (int i = -1; i <= 1; i++) {
+                int nx = x + i;
+                if (nx >= 0 && nx < w) {
+                    sum += src[y][nx] * kx[i + 1];
+                }
+            }
+            temp[y][x] = sum;
+        }
+    }
+
+    /* Pass 2: vertical taps, temp -> dst. */
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            float sum = 0.0f;
+            for (int i = -1; i <= 1; i++) {
+                int ny = y + i;
+                if (ny >= 0 && ny < h) {
+                    sum += temp[ny][x] * ky[i + 1];
+                }
+            }
+            dst[y][x] = sum;
+        }
+    }
+}
+/* Scalar horizontal 3-tap sample at (y, x); taps outside [0, w) are skipped. */
+static float separableBlurTapRow(float img[][100], int y, int x, int w, const float k[3]) {
+    float sum = 0.0f;
+    for (int i = -1; i <= 1; i++) {
+        int nx = x + i;
+        if (nx >= 0 && nx < w) {
+            sum += img[y][nx] * k[i + 1];
+        }
+    }
+    return sum;
+}
+
+/* Scalar vertical 3-tap sample at (y, x); taps outside [0, h) are skipped. */
+static float separableBlurTapCol(float img[][100], int y, int x, int h, const float k[3]) {
+    float sum = 0.0f;
+    for (int i = -1; i <= 1; i++) {
+        int ny = y + i;
+        if (ny >= 0 && ny < h) {
+            sum += img[ny][x] * k[i + 1];
+        }
+    }
+    return sum;
+}
+
+/*
+ * NEON implementation of a separable 3-tap blur.
+ *
+ * A horizontal pass with kernel kx writes into a scratch image, then a
+ * vertical pass with kernel ky writes the final value into dst. Taps that
+ * fall outside the h x w region are skipped, so border pixels are computed
+ * from fewer taps.
+ *
+ * Both passes process four adjacent columns per NEON operation. Pixels whose
+ * neighbourhood crosses the image border, and any columns left over when the
+ * width is not a multiple of the vector step, are computed by the scalar
+ * helpers so that no load or store touches memory outside the h x w region.
+ *
+ * src, dst : images stored with a row stride of 100 floats
+ * h, w     : number of rows / columns to process
+ * kx, ky   : 3-element kernels; index 0 is the tap at offset -1,
+ *            index 1 the centre tap, index 2 the tap at offset +1
+ *
+ * If h or w is non-positive, or exceeds the capacity of the fixed-size
+ * scratch image, the call returns without touching dst.
+ */
+void applySeparableGaussianBlur(float src[][100], float dst[][100], int h, int w, float kx[3], float ky[3]) {
+    float temp[100][100];
+    const int max_rows = (int)(sizeof temp / sizeof temp[0]);
+    const int max_cols = (int)(sizeof temp[0] / sizeof temp[0][0]);
+
+    if (h <= 0 || w <= 0 || h > max_rows || w > max_cols) {
+        return;
+    }
+
+    /* Pass 1: horizontal taps, src -> temp. */
+    const float32x4_t kx_prev = vdupq_n_f32(kx[0]);
+    const float32x4_t kx_mid = vdupq_n_f32(kx[1]);
+    const float32x4_t kx_next = vdupq_n_f32(kx[2]);
+    for (int y = 0; y < h; y++) {
+        /* Column 0 has no left neighbour. */
+        temp[y][0] = separableBlurTapRow(src, y, 0, w, kx);
+
+        /* Lanes cover x..x+3; the right-neighbour load reads up to x+4,
+         * which must still be a valid column (<= w-1). */
+        int x = 1;
+        for (; x + 4 <= w - 1; x += 4) {
+            float32x4_t left_vec = vld1q_f32(&src[y][x - 1]);
+            float32x4_t mid_vec = vld1q_f32(&src[y][x]);
+            float32x4_t right_vec = vld1q_f32(&src[y][x + 1]);
+
+            float32x4_t acc = vmulq_f32(left_vec, kx_prev);
+            acc = vmlaq_f32(acc, mid_vec, kx_mid);
+            acc = vmlaq_f32(acc, right_vec, kx_next);
+
+            vst1q_f32(&temp[y][x], acc);
+        }
+
+        /* Remaining interior columns and the right border. */
+        for (; x < w; x++) {
+            temp[y][x] = separableBlurTapRow(src, y, x, w, kx);
+        }
+    }
+
+    /* Pass 2: vertical taps, temp -> dst. */
+    const float32x4_t ky_prev = vdupq_n_f32(ky[0]);
+    const float32x4_t ky_mid = vdupq_n_f32(ky[1]);
+    const float32x4_t ky_next = vdupq_n_f32(ky[2]);
+    for (int y = 0; y < h; y++) {
+        int x = 0;
+
+        /* Only rows with both an upper and a lower neighbour are vectorised;
+         * each lane is one column, so the three loads come from rows
+         * y-1, y and y+1 at the same column offset. */
+        if (y > 0 && y < h - 1) {
+            for (; x + 4 <= w; x += 4) {
+                float32x4_t up_vec = vld1q_f32(&temp[y - 1][x]);
+                float32x4_t mid_vec = vld1q_f32(&temp[y][x]);
+                float32x4_t down_vec = vld1q_f32(&temp[y + 1][x]);
+
+                float32x4_t acc = vmulq_f32(up_vec, ky_prev);
+                acc = vmlaq_f32(acc, mid_vec, ky_mid);
+                acc = vmlaq_f32(acc, down_vec, ky_next);
+
+                vst1q_f32(&dst[y][x], acc);
+            }
+        }
+
+        /* Border rows in full, plus leftover columns of interior rows. */
+        for (; x < w; x++) {
+            dst[y][x] = separableBlurTapCol(temp, y, x, h, ky);
+        }
+    }
+}
+/*
+ * Runs one blur implementation once and prints the processor time it took.
+ *
+ * func               : blur routine to time
+ * src, dst, h, w,
+ * kx, ky             : forwarded to func unchanged
+ *
+ * The duration is measured with clock() and reported in seconds on stdout.
+ */
+void benchmark(void (*func)(float[][100], float[][100], int, int, float[], float[]), float src[][100], float dst[][100], int h, int w, float kx[3], float ky[3]) {
+    const clock_t t_begin = clock();
+    func(src, dst, h, w, kx, ky);
+    const clock_t t_finish = clock();
+
+    /* Convert clock ticks to seconds. */
+    const clock_t ticks = t_finish - t_begin;
+    printf("Time spent: %f seconds\n", (double)ticks / CLOCKS_PER_SEC);
+}
+/*
+ * Fills a 100x100 image with pseudo-random values in [0, 1] and times the
+ * scalar blur followed by the NEON blur on it, both with the kernel
+ * {0.25, 0.5, 0.25} in each direction. Both runs write into the same
+ * destination image.
+ */
+int main() {
+    float src[100][100];
+    float dst[100][100];
+    float kx[3] = {0.25f, 0.5f, 0.25f};
+    float ky[3] = {0.25f, 0.5f, 0.25f};
+    int h = 100;
+    int w = 100;
+
+    /* Row-major fill: one rand() call per pixel. */
+    for (int idx = 0; idx < h * w; idx++) {
+        src[idx / w][idx % w] = (float)rand() / RAND_MAX;
+    }
+
+    fputs("Original Gaussian Blur:\n", stdout);
+    benchmark(applyGaussianBlur, src, dst, h, w, kx, ky);
+
+    fputs("NEON Optimized Gaussian Blur:\n", stdout);
+    benchmark(applySeparableGaussianBlur, src, dst, h, w, kx, ky);
+
+    return 0;
+}