You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

98 lines
3.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <iostream>
#include <arm_neon.h>
#include <ctime>
using namespace std;
// 输入图像的大小
const int HEIGHT = 5;
const int WIDTH = 5;
// 高斯模糊核分解
const float kx[3] = {0.27901f, 0.44198f, 0.27901f};
const float ky[3] = {0.27901f, 0.44198f, 0.27901f};
void applySeparableGaussianBlur(float src[][WIDTH], float dst[][WIDTH], int h, int w, const float kx[3], const float ky[3]) {
// 中间缓存,用于存储行方向卷积结果
float buf[3][WIDTH] = {0};
int bufIndex = 0;
// NEON向量化
float32x4_t kx_vec = vld1q_f32(kx); // 加载行方向权重到向量
for (int i = 0; i < h; ++i) {
// 行方向卷积
for (int j = 0; j < w - 2; j += 4) { // 每次处理4个元素
// 加载3个相邻像素到向量
float32x4_t src0 = vld1q_f32(&src[i][j]);
float32x4_t src1 = vld1q_f32(&src[i][j + 1]);
float32x4_t src2 = vld1q_f32(&src[i][j + 2]);
// 计算加权和
float32x4_t result = vmulq_lane_f32(src0, vget_low_f32(kx_vec), 0); // src0 * kx[0]
result = vmlaq_lane_f32(result, src1, vget_low_f32(kx_vec), 1); // + src1 * kx[1]
result = vmlaq_lane_f32(result, src2, vget_high_f32(kx_vec), 0); // + src2 * kx[2]
// 保存行方向卷积结果
vst1q_f32(&buf[bufIndex][j], result);
}
// 行间卷积当凑齐3行时计算
if (i >= 2) {
for (int j = 0; j < w - 2; j += 4) { // 每次处理4个元素
// 加载行方向结果到向量
float32x4_t buf0 = vld1q_f32(&buf[(bufIndex - 2 + 3) % 3][j]);
float32x4_t buf1 = vld1q_f32(&buf[(bufIndex - 1 + 3) % 3][j]);
float32x4_t buf2 = vld1q_f32(&buf[bufIndex][j]);
// 计算列方向加权和
float32x4_t result = vmulq_lane_f32(buf0, vget_low_f32(kx_vec), 0); // buf0 * ky[0]
result = vmlaq_lane_f32(result, buf1, vget_low_f32(kx_vec), 1); // + buf1 * ky[1]
result = vmlaq_lane_f32(result, buf2, vget_high_f32(kx_vec), 0); // + buf2 * ky[2]
// 保存最终结果
vst1q_f32(&dst[i - 1][j], result);
}
}
// 更新缓冲区索引
bufIndex = (bufIndex + 1) % 3;
}
}
int main() {
// 输入矩阵
float src[HEIGHT][WIDTH] = {
{10, 20, 30, 40, 50},
{60, 70, 80, 90, 100},
{110, 120, 130, 140, 150},
{160, 170, 180, 190, 200},
{210, 220, 230, 240, 250},
};
float dst[HEIGHT][WIDTH] = {0};
// 计时开始
clock_t start = clock();
// 调用优化后的GaussianBlur
applySeparableGaussianBlur(src, dst, HEIGHT, WIDTH, kx, ky);
// 计时结束
clock_t end = clock();
double elapsed = double(end - start) / CLOCKS_PER_SEC;
// 输出结果矩阵
cout << "Blurred Image:" << endl;
for (int i = 0; i < HEIGHT; ++i) {
for (int j = 0; j < WIDTH; ++j) {
cout << dst[i][j] << "\t";
}
cout << endl;
}
// 输出运行时间
cout << "Execution time: " << elapsed << " seconds" << endl;
return 0;
}