// Separable 3x3 Gaussian blur demo.
//
// The 2-D kernel is factored into a horizontal 3-tap pass followed by a
// vertical 3-tap pass. Only three horizontally-filtered rows are kept alive at
// any time (a small ring buffer), so no full-size intermediate image is needed.
// When the compiler targets ARM NEON the inner loops process four pixels per
// step; everywhere else (and for the leftover columns) a scalar loop is used.

#include <ctime>
#include <iostream>

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define GAUSSIAN_BLUR_HAS_NEON 1
#else
#define GAUSSIAN_BLUR_HAS_NEON 0
#endif

using namespace std;

// Dimensions of the input image. WIDTH is also the row stride of every image
// passed to applySeparableGaussianBlur.
constexpr int HEIGHT = 5;
constexpr int WIDTH = 5;

// Factored Gaussian kernel: horizontal taps (kx) and vertical taps (ky).
const float kx[3] = {0.27901f, 0.44198f, 0.27901f};
const float ky[3] = {0.27901f, 0.44198f, 0.27901f};

namespace {

// Horizontal pass over one row:
//   out[c] = k[0]*in[c-1] + k[1]*in[c] + k[2]*in[c+1]   for c in [1, w-2].
// out[0] and out[w-1] are not written.
void convolveRow3(const float* in, float* out, int w, const float k[3]) {
    int c = 1;
#if GAUSSIAN_BLUR_HAS_NEON
    // Broadcast each tap instead of loading the 3-element kernel with a
    // 4-lane load, which would read one float past the end of the array.
    const float32x4_t k0 = vdupq_n_f32(k[0]);
    const float32x4_t k1 = vdupq_n_f32(k[1]);
    const float32x4_t k2 = vdupq_n_f32(k[2]);
    // A 4-wide step at column c reads in[c-1 .. c+4] and writes out[c .. c+3],
    // so it is only safe while c + 4 <= w - 1.
    for (; c + 4 <= w - 1; c += 4) {
        float32x4_t acc = vmulq_f32(vld1q_f32(in + c - 1), k0);
        acc = vmlaq_f32(acc, vld1q_f32(in + c), k1);
        acc = vmlaq_f32(acc, vld1q_f32(in + c + 1), k2);
        vst1q_f32(out + c, acc);
    }
#endif
    // Scalar tail (or the whole row when NEON is unavailable).
    for (; c < w - 1; ++c) {
        out[c] = k[0] * in[c - 1] + k[1] * in[c] + k[2] * in[c + 1];
    }
}

// Vertical pass over three horizontally-filtered rows:
//   out[c] = k[0]*top[c] + k[1]*mid[c] + k[2]*bot[c]    for c in [1, w-2].
void combineRows3(const float* top, const float* mid, const float* bot,
                  float* out, int w, const float k[3]) {
    int c = 1;
#if GAUSSIAN_BLUR_HAS_NEON
    const float32x4_t k0 = vdupq_n_f32(k[0]);
    const float32x4_t k1 = vdupq_n_f32(k[1]);
    const float32x4_t k2 = vdupq_n_f32(k[2]);
    // Writes out[c .. c+3]; the last interior column is w - 2.
    for (; c + 4 <= w - 1; c += 4) {
        float32x4_t acc = vmulq_f32(vld1q_f32(top + c), k0);
        acc = vmlaq_f32(acc, vld1q_f32(mid + c), k1);
        acc = vmlaq_f32(acc, vld1q_f32(bot + c), k2);
        vst1q_f32(out + c, acc);
    }
#endif
    for (; c < w - 1; ++c) {
        out[c] = k[0] * top[c] + k[1] * mid[c] + k[2] * bot[c];
    }
}

}  // namespace

/// Applies a separable 3x3 blur to the interior of an image.
///
/// @param src  Input image, h rows with a row stride of WIDTH floats.
/// @param dst  Output image, same layout as src. Only interior pixels
///             (rows 1..h-2, columns 1..w-2) are written; border pixels keep
///             whatever value they had on entry.
/// @param h    Number of rows to process.
/// @param w    Number of columns to process; must satisfy 3 <= w <= WIDTH.
/// @param kx   Horizontal taps (left, centre, right).
/// @param ky   Vertical taps (top, centre, bottom).
///
/// If src or dst is null, or the image is too small to contain a full 3x3
/// window, or w exceeds the row stride, the function returns without writing.
void applySeparableGaussianBlur(float src[][WIDTH], float dst[][WIDTH], int h, int w,
                                const float kx[3], const float ky[3]) {
    if (src == nullptr || dst == nullptr || h < 3 || w < 3 || w > WIDTH) {
        return;
    }

    // Ring buffer holding the horizontal-pass result of the last three rows;
    // source row i lives in slot i % 3.
    float rows[3][WIDTH] = {};

    for (int i = 0; i < h; ++i) {
        const int cur = i % 3;
        convolveRow3(src[i], rows[cur], w, kx);

        // Once rows i-2, i-1 and i are available, emit output row i-1.
        if (i >= 2) {
            const float* top = rows[(i + 1) % 3];  // row i-2
            const float* mid = rows[(i + 2) % 3];  // row i-1
            combineRows3(top, mid, rows[cur], dst[i - 1], w, ky);
        }
    }
}

// Define GAUSSIAN_BLUR_NO_MAIN to compile the blur routine into another
// program (for example a test harness) without this demo entry point.
#ifndef GAUSSIAN_BLUR_NO_MAIN
int main() {
    // Input matrix.
    float src[HEIGHT][WIDTH] = {
        {10, 20, 30, 40, 50},
        {60, 70, 80, 90, 100},
        {110, 120, 130, 140, 150},
        {160, 170, 180, 190, 200},
        {210, 220, 230, 240, 250},
    };
    float dst[HEIGHT][WIDTH] = {};

    // Time the blur call.
    const clock_t start = clock();
    applySeparableGaussianBlur(src, dst, HEIGHT, WIDTH, kx, ky);
    const clock_t stop = clock();
    const double elapsed = static_cast<double>(stop - start) / CLOCKS_PER_SEC;

    // Print the result matrix.
    cout << "Blurred Image:" << '\n';
    for (int i = 0; i < HEIGHT; ++i) {
        for (int j = 0; j < WIDTH; ++j) {
            cout << dst[i][j] << "\t";
        }
        cout << '\n';
    }

    // Print the elapsed CPU time.
    cout << "Execution time: " << elapsed << " seconds" << '\n';
    return 0;
}
#endif  // GAUSSIAN_BLUR_NO_MAIN