// Separable 3x3 Gaussian blur demo: NEON-vectorised on ARM, scalar elsewhere.
// Recovered from the patch "Add 3" (commit 18c7aab1d6f2e50d5944cdd102a0cb3bc2ff5f4f),
// which creates this program as the file `3`.

#include <ctime>
#include <iostream>

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define GAUSS_BLUR_USE_NEON 1
#else
#define GAUSS_BLUR_USE_NEON 0
#endif

// Size of the input image.
const int HEIGHT = 5;
const int WIDTH = 5;

// Separated Gaussian kernel: kx is applied along rows, ky along columns.
const float kx[3] = {0.27901f, 0.44198f, 0.27901f};
const float ky[3] = {0.27901f, 0.44198f, 0.27901f};

namespace {

// Horizontal 3-tap pass: out[j] = taps[0]*in[j] + taps[1]*in[j+1] + taps[2]*in[j+2]
// for j in [0, count). `in` must hold at least count + 2 readable floats.
void convolveRow3(const float* in, float* out, int count, const float taps[3]) {
    int j = 0;
#if GAUSS_BLUR_USE_NEON
    // The widest load touches in[j + 5]; j + 4 <= count keeps it inside count + 2.
    for (; j + 4 <= count; j += 4) {
        float32x4_t acc = vmulq_n_f32(vld1q_f32(in + j), taps[0]);
        acc = vmlaq_n_f32(acc, vld1q_f32(in + j + 1), taps[1]);
        acc = vmlaq_n_f32(acc, vld1q_f32(in + j + 2), taps[2]);
        vst1q_f32(out + j, acc);
    }
#endif
    // Scalar tail (and the whole row when NEON is unavailable).
    for (; j < count; ++j) {
        out[j] = taps[0] * in[j] + taps[1] * in[j + 1] + taps[2] * in[j + 2];
    }
}

// Vertical 3-tap pass: out[j] = taps[0]*top[j] + taps[1]*mid[j] + taps[2]*bottom[j]
// for j in [0, count).
void blendRows3(const float* top, const float* mid, const float* bottom,
                float* out, int count, const float taps[3]) {
    int j = 0;
#if GAUSS_BLUR_USE_NEON
    for (; j + 4 <= count; j += 4) {
        float32x4_t acc = vmulq_n_f32(vld1q_f32(top + j), taps[0]);
        acc = vmlaq_n_f32(acc, vld1q_f32(mid + j), taps[1]);
        acc = vmlaq_n_f32(acc, vld1q_f32(bottom + j), taps[2]);
        vst1q_f32(out + j, acc);
    }
#endif
    for (; j < count; ++j) {
        out[j] = taps[0] * top[j] + taps[1] * mid[j] + taps[2] * bottom[j];
    }
}

}  // namespace

// Applies a separable 3x3 blur to the interior of `src` and writes it to `dst`.
//
//   src, dst : images whose rows are WIDTH floats wide; only the top-left
//              h x w region is processed.
//   h, w     : region size; requires 3 <= h, 3 <= w <= WIDTH, otherwise the
//              call is a no-op.
//   kx, ky   : 3-tap weights for the row pass and the column pass.
//
// Only pixels with a full 3x3 neighbourhood (rows 1..h-2, columns 1..w-2) are
// written; border pixels of `dst` are left untouched.
void applySeparableGaussianBlur(float src[][WIDTH], float dst[][WIDTH], int h, int w, const float kx[3], const float ky[3]) {
    if (h < 3 || w < 3 || w > WIDTH) {
        return;
    }

    // Ring buffer holding the row-pass results of the three most recent rows.
    float buf[3][WIDTH] = {};
    const int interior = w - 2;  // number of output columns per row

    for (int i = 0; i < h; ++i) {
        const int cur = i % 3;
        convolveRow3(src[i], buf[cur], interior, kx);

        // Once three filtered rows are available, emit the row centred on i - 1.
        if (i >= 2) {
            const int oldest = (i + 1) % 3;  // slot holding row i - 2
            const int middle = (i + 2) % 3;  // slot holding row i - 1
            // The horizontal window j..j+2 is centred on column j + 1, hence the
            // +1 offset into the destination row.
            blendRows3(buf[oldest], buf[middle], buf[cur], &dst[i - 1][1], interior, ky);
        }
    }
}

int main() {
    // Input matrix.
    float src[HEIGHT][WIDTH] = {
        {10, 20, 30, 40, 50},
        {60, 70, 80, 90, 100},
        {110, 120, 130, 140, 150},
        {160, 170, 180, 190, 200},
        {210, 220, 230, 240, 250},
    };
    float dst[HEIGHT][WIDTH] = {0};

    // Time the blur call.
    const std::clock_t start = std::clock();
    applySeparableGaussianBlur(src, dst, HEIGHT, WIDTH, kx, ky);
    const std::clock_t end = std::clock();
    const double elapsed = static_cast<double>(end - start) / CLOCKS_PER_SEC;

    // Print the result matrix.
    std::cout << "Blurred Image:" << '\n';
    for (int i = 0; i < HEIGHT; ++i) {
        for (int j = 0; j < WIDTH; ++j) {
            std::cout << dst[i][j] << "\t";
        }
        std::cout << '\n';
    }

    // Print the elapsed time.
    std::cout << "Execution time: " << elapsed << " seconds" << '\n';

    return 0;
}