/*
 * task1_pro.cpp -- 3x3 blur of a small float image, vectorised with ARM NEON
 * when the target supports it.
 *
 * Provenance: recovered from the patch "Add task1_pro.cpp"
 * (commit 27dd9a6fe43e8258a182fbfda43eb8f502eeed21, author pv3e4i5aj,
 * Sat, 7 Dec 2024 02:44:35 +0800).
 */
#include <stdio.h>
#include <time.h>

/* NEON intrinsics are only available on ARM targets; everywhere else the
 * scalar path below is used for every pixel. */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define TASK1_PRO_HAVE_NEON 1
#else
#define TASK1_PRO_HAVE_NEON 0
#endif

#define H 5
#define W 5

/*
 * Weighted sum of the 3x3 neighbourhood of src centred on (i, j).
 *
 * kernel[0][0] multiplies the top-left neighbour src[i-1][j-1] and
 * kernel[2][2] the bottom-right neighbour src[i+1][j+1] (the kernel is not
 * flipped). The caller must ensure 1 <= i <= H-2 and 1 <= j <= W-2.
 */
static float blurPixel(float src[H][W], int i, int j, float kernel[3][3]) {
    float acc = 0.0f;
    for (int di = 0; di < 3; ++di) {
        for (int dj = 0; dj < 3; ++dj) {
            acc += kernel[di][dj] * src[i - 1 + di][j - 1 + dj];
        }
    }
    return acc;
}

/*
 * Apply a 3x3 kernel to the interior of an h-by-w image.
 *
 * For every interior pixel (1 <= i <= h-2, 1 <= j <= w-2):
 *
 *     dst[i][j] = sum over a,b in 0..2 of kernel[a][b] * src[i-1+a][j-1+b]
 *
 * Parameters:
 *   src    - input image; only the top-left h-by-w region is read.
 *   dst    - output image; only interior pixels are written, so the
 *            one-pixel border keeps whatever the caller stored there.
 *   h, w   - number of rows / columns to process; must not exceed H / W.
 *            If either is smaller than 3 there is no interior and nothing
 *            is written.
 *   kernel - the nine filter coefficients.
 *
 * On NEON targets four adjacent output pixels are produced per step; any
 * leftover interior columns are finished by the scalar helper. On other
 * targets the scalar helper handles every pixel.
 */
void applyGaussianBlur(float src[H][W], float dst[H][W], int h, int w, float kernel[3][3]) {
#if TASK1_PRO_HAVE_NEON
    /* Broadcast each of the nine coefficients into its own NEON register. */
    const float32x4_t k00 = vdupq_n_f32(kernel[0][0]);
    const float32x4_t k01 = vdupq_n_f32(kernel[0][1]);
    const float32x4_t k02 = vdupq_n_f32(kernel[0][2]);
    const float32x4_t k10 = vdupq_n_f32(kernel[1][0]);
    const float32x4_t k11 = vdupq_n_f32(kernel[1][1]);
    const float32x4_t k12 = vdupq_n_f32(kernel[1][2]);
    const float32x4_t k20 = vdupq_n_f32(kernel[2][0]);
    const float32x4_t k21 = vdupq_n_f32(kernel[2][1]);
    const float32x4_t k22 = vdupq_n_f32(kernel[2][2]);
#endif

    for (int i = 1; i < h - 1; ++i) {
        int j = 1;

#if TASK1_PRO_HAVE_NEON
        /*
         * Four output pixels j..j+3 per step. The right-hand neighbour load
         * reads columns j+1..j+4, so a step is only taken while j+4 is still
         * a valid column; this also keeps the store off the right border.
         * With W == 5 there are only three interior columns, so this loop
         * does not run and the scalar tail does all the work.
         */
        for (; j + 4 < w; j += 4) {
            /* Offset loads give the left / centre / right neighbours of each
             * lane directly, without rotating lanes inside one vector. */
            float32x4_t acc = vmulq_f32(vld1q_f32(&src[i - 1][j - 1]), k00); /* top-left  */
            acc = vmlaq_f32(acc, vld1q_f32(&src[i - 1][j]), k01);            /* top       */
            acc = vmlaq_f32(acc, vld1q_f32(&src[i - 1][j + 1]), k02);        /* top-right */
            acc = vmlaq_f32(acc, vld1q_f32(&src[i][j - 1]), k10);            /* mid-left  */
            acc = vmlaq_f32(acc, vld1q_f32(&src[i][j]), k11);                /* centre    */
            acc = vmlaq_f32(acc, vld1q_f32(&src[i][j + 1]), k12);            /* mid-right */
            acc = vmlaq_f32(acc, vld1q_f32(&src[i + 1][j - 1]), k20);        /* bot-left  */
            acc = vmlaq_f32(acc, vld1q_f32(&src[i + 1][j]), k21);            /* bottom    */
            acc = vmlaq_f32(acc, vld1q_f32(&src[i + 1][j + 1]), k22);        /* bot-right */

            /* Store the four results into dst. */
            vst1q_f32(&dst[i][j], acc);
        }
#endif

        /* Scalar tail: remaining interior columns (all of them without NEON). */
        for (; j < w - 1; ++j) {
            dst[i][j] = blurPixel(src, i, j, kernel);
        }
    }
}

/* Define TASK1_PRO_NO_MAIN to build the kernel without the demo driver,
 * e.g. when linking it into a test harness. */
#ifndef TASK1_PRO_NO_MAIN
int main(void) {
    float inputImage[H][W] = {{0}};
    float dst[H][W] = {{0}};
    float kernel[3][3] = {
        {1.0f / 16, 2.0f / 16, 1.0f / 16},
        {2.0f / 16, 4.0f / 16, 2.0f / 16},
        {1.0f / 16, 2.0f / 16, 1.0f / 16}
    };

    /* Fill the input with a simple ramp: pixel (i, j) holds i * W + j. */
    for (int i = 0; i < H; i++) {
        for (int j = 0; j < W; j++) {
            inputImage[i][j] = (float)(i * W + j);
        }
    }

    /* Time a single call; clock() measures processor time. */
    clock_t start = clock();
    applyGaussianBlur(inputImage, dst, H, W, kernel);
    clock_t end = clock();
    double time_spent = (double)(end - start) / CLOCKS_PER_SEC;
    printf("运行时间:%lf秒\ndst矩阵结果为\n", time_spent);

    for (int i = 0; i < H; i++) {
        for (int j = 0; j < W; j++) {
            printf("%f ", dst[i][j]);
        }
        printf("\n");
    }

    return 0;
}
#endif /* TASK1_PRO_NO_MAIN */