#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* NEON is only available on ARM targets; everywhere else the scalar path is used. */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define BLUR_USE_NEON 1
#else
#define BLUR_USE_NEON 0
#endif

#define H 5
#define W 5

/*
 * Weighted sum of the 3x3 neighbourhood of src centred on (i, j).
 * The caller guarantees 1 <= i <= H - 2 and 1 <= j <= W - 2.
 */
static inline float blur_pixel(float src[H][W], int i, int j, float kernel[3][3])
{
    float acc = 0.0f;

    for (int r = 0; r < 3; ++r) {
        for (int c = 0; c < 3; ++c) {
            acc += src[i - 1 + r][j - 1 + c] * kernel[r][c];
        }
    }
    return acc;
}

/*
 * Apply a 3x3 kernel to the interior of an image.
 *
 * src    - input image; only the top-left h x w region is read.
 * dst    - output image; only interior pixels (rows 1..h-2, columns 1..w-2)
 *          are written. Border pixels are left untouched.
 * h, w   - size of the region to process; must satisfy h <= H and w <= W.
 *          Nothing is written when h < 3 or w < 3.
 * kernel - 3x3 weights; kernel[r][c] multiplies src[i-1+r][j-1+c].
 *
 * When NEON is available, four output pixels are computed per step for as
 * long as all loads and stores stay inside the row; the remaining columns
 * are handled one pixel at a time.
 */
void applyGaussianBlur(float src[H][W], float dst[H][W], int h, int w, float kernel[3][3])
{
#if BLUR_USE_NEON
    /* Broadcast each of the nine weights across a vector once, outside the loops. */
    float32x4_t k[3][3];
    for (int r = 0; r < 3; ++r) {
        for (int c = 0; c < 3; ++c) {
            k[r][c] = vdupq_n_f32(kernel[r][c]);
        }
    }
#endif

    for (int i = 1; i < h - 1; ++i) {
        int j = 1;

#if BLUR_USE_NEON
        /*
         * Outputs j..j+3 read source columns j-1..j+4, so a vector step is
         * only safe while j + 4 is still a valid column (j + 4 < w). This
         * also keeps the 4-wide store inside the interior (j + 3 <= w - 2).
         */
        for (; j + 4 < w; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);

            for (int r = 0; r < 3; ++r) {
                const float *row = &src[i - 1 + r][j - 1];

                /* Unaligned loads at offsets 0, 1, 2 give the left, centre and right neighbours. */
                acc = vmlaq_f32(acc, vld1q_f32(row), k[r][0]);
                acc = vmlaq_f32(acc, vld1q_f32(row + 1), k[r][1]);
                acc = vmlaq_f32(acc, vld1q_f32(row + 2), k[r][2]);
            }
            vst1q_f32(&dst[i][j], acc);
        }
#endif

        /* Scalar tail: remaining interior columns (all of them without NEON). */
        for (; j < w - 1; ++j) {
            dst[i][j] = blur_pixel(src, i, j, kernel);
        }
    }
}

int main()
{
    float inputImage[H][W] = {0};
    float dst[H][W] = {0};
    float kernel[3][3] = {
        {1.0f / 16, 2.0f / 16, 1.0f / 16},
        {2.0f / 16, 4.0f / 16, 2.0f / 16},
        {1.0f / 16, 2.0f / 16, 1.0f / 16}
    };

    /* Fill the input with a simple ramp: 0, 1, 2, ... in row-major order. */
    for (int i = 0; i < H; i++) {
        for (int j = 0; j < W; j++) {
            inputImage[i][j] = (float)(i * W + j);
        }
    }

    clock_t start = clock();
    applyGaussianBlur(inputImage, dst, H, W, kernel);
    clock_t end = clock();

    /* clock() measures processor time; convert ticks to seconds. */
    double time_spent = (double)(end - start) / CLOCKS_PER_SEC;
    printf("运行时间:%lf秒\ndst矩阵结果为\n", time_spent);

    for (int i = 0; i < H; i++) {
        for (int j = 0; j < W; j++) {
            printf("%f ", dst[i][j]);
        }
        printf("\n");
    }

    return 0;
}