Add 3

9 months ago · 18c7aab1d6
parent 7d28c79ebe
commit 18c7aab1d6
1 changed files with 97 additions and 0 deletions
--- a/97
+++ b/97
@ -0,0 +1,97 @@
+#include <iostream>
+#include <arm_neon.h>
+#include <ctime>
+
+using namespace std;
+
// Dimensions of the input image: number of rows and number of columns.
constexpr int HEIGHT = 5;
constexpr int WIDTH = 5;

// Separable 3-tap Gaussian kernel: kx holds the horizontal weights and ky the
// vertical weights. Each set of weights sums to 1.
constexpr float kx[3] = {0.27901f, 0.44198f, 0.27901f};
constexpr float ky[3] = {0.27901f, 0.44198f, 0.27901f};
+
// Computes out[j] = a[j] * k0 + b[j] * k1 + c[j] * k2 for every j in [0, n).
// Four outputs are produced per NEON iteration; the remaining (n % 4) outputs
// are handled by a scalar tail so that no load or store touches index >= n.
static void weightedSum3(const float* a, const float* b, const float* c,
                         float k0, float k1, float k2, float* out, int n) {
    int j = 0;
    for (; j + 4 <= n; j += 4) {
        float32x4_t acc = vmulq_n_f32(vld1q_f32(a + j), k0);
        acc = vmlaq_n_f32(acc, vld1q_f32(b + j), k1);
        acc = vmlaq_n_f32(acc, vld1q_f32(c + j), k2);
        vst1q_f32(out + j, acc);
    }
    for (; j < n; ++j) {
        out[j] = a[j] * k0 + b[j] * k1 + c[j] * k2;
    }
}

// Applies a separable 3x3 blur to `src` and writes the result to `dst`.
//
// Each row is first filtered horizontally with `kx`; once three filtered rows
// are available they are combined vertically with `ky`.
//
// Parameters:
//   src    - input image, h rows of WIDTH floats; the first w columns are used.
//   dst    - output image with the same layout as src.
//   h, w   - number of rows / columns to process. w is clamped to WIDTH.
//   kx, ky - 3-tap horizontal and vertical kernel weights.
//
// Only pixels whose full 3x3 neighbourhood lies inside the image are written,
// i.e. dst[1..h-2][1..w-2]. Border pixels of dst are left untouched, and
// nothing is written when h < 3 or w < 3.
void applySeparableGaussianBlur(float src[][WIDTH], float dst[][WIDTH], int h, int w, const float kx[3], const float ky[3]) {
    // A row stores exactly WIDTH floats, so a larger w cannot be honoured.
    if (w > WIDTH) {
        w = WIDTH;
    }
    // A 3x3 window does not fit into anything smaller than 3x3.
    if (h < 3 || w < 3) {
        return;
    }

    // Number of window positions per row (window centres 1 .. w-2).
    const int innerW = w - 2;

    // Ring buffer holding the horizontally filtered versions of the three most
    // recent rows; entry [r][j] is the result for the window centred on
    // column j + 1.
    float buf[3][WIDTH] = {};
    int bufIndex = 0;

    for (int i = 0; i < h; ++i) {
        // Horizontal pass: taps are src[i][j], src[i][j + 1], src[i][j + 2].
        weightedSum3(src[i], src[i] + 1, src[i] + 2,
                     kx[0], kx[1], kx[2], buf[bufIndex], innerW);

        // Vertical pass, possible once rows i-2, i-1 and i have been filtered.
        if (i >= 2) {
            const float* top = buf[(bufIndex + 1) % 3];  // row i - 2
            const float* mid = buf[(bufIndex + 2) % 3];  // row i - 1
            const float* bot = buf[bufIndex];            // row i
            // The window is centred on (i - 1, j + 1), hence the +1 column
            // offset into the destination row.
            weightedSum3(top, mid, bot,
                         ky[0], ky[1], ky[2], dst[i - 1] + 1, innerW);
        }

        // Advance the ring buffer slot for the next row.
        bufIndex = (bufIndex + 1) % 3;
    }
}
+
+int main() {
+    // 输入矩阵
+    float src[HEIGHT][WIDTH] = {
+        {10, 20, 30, 40, 50},
+        {60, 70, 80, 90, 100},
+        {110, 120, 130, 140, 150},
+        {160, 170, 180, 190, 200},
+        {210, 220, 230, 240, 250},
+    };
+    float dst[HEIGHT][WIDTH] = {0};
+
+    // 计时开始
+    clock_t start = clock();
+
+    // 调用优化后的GaussianBlur
+    applySeparableGaussianBlur(src, dst, HEIGHT, WIDTH, kx, ky);
+
+    // 计时结束
+    clock_t end = clock();
+    double elapsed = double(end - start) / CLOCKS_PER_SEC;
+
+    // 输出结果矩阵
+    cout << "Blurred Image:" << endl;
+    for (int i = 0; i < HEIGHT; ++i) {
+        for (int j = 0; j < WIDTH; ++j) {
+            cout << dst[i][j] << "\t";
+        }
+        cout << endl;
+    }
+
+    // 输出运行时间
+    cout << "Execution time: " << elapsed << " seconds" << endl;
+
+    return 0;
+}