|
|
|
@ -0,0 +1,97 @@
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <arm_neon.h>
|
|
|
|
|
#include <ctime>
|
|
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
|
|
// Dimensions of the demo input image.
const int HEIGHT = 5;
const int WIDTH = 5;

// 1-D factors of the separable 3x3 Gaussian kernel (horizontal / vertical taps).
const float kx[3] = {0.27901f, 0.44198f, 0.27901f};
const float ky[3] = {0.27901f, 0.44198f, 0.27901f};

/**
 * Applies a separable 3x3 filter to the interior of a float image.
 *
 * For every pixel (r, c) with 1 <= r <= h-2 and 1 <= c <= w-2:
 *
 *   dst[r][c] = sum over dy, dx in {0,1,2} of
 *               ky[dy] * kx[dx] * src[r-1+dy][c-1+dx]
 *
 * Pixels on the border of the h x w region (first/last row and column) have
 * no full 3x3 neighbourhood and are never written; they keep whatever value
 * the caller stored in dst.
 *
 * The horizontal pass of each input row is kept in a three-row ring buffer,
 * so the vertical pass can run as soon as three consecutive rows are ready.
 *
 * @param src input image, each row holding WIDTH floats
 * @param dst output image, each row holding WIDTH floats
 * @param h   number of rows to process
 * @param w   number of columns to process; values above WIDTH are clamped
 *            to WIDTH because a row holds only WIDTH elements
 * @param kx  three horizontal filter taps
 * @param ky  three vertical filter taps
 */
void applySeparableGaussianBlur(float src[][WIDTH], float dst[][WIDTH], int h, int w, const float kx[3], const float ky[3]) {
    if (w > WIDTH) {
        w = WIDTH;
    }
    // A 3x3 window needs at least three rows and three columns.
    if (h < 3 || w < 3) {
        return;
    }

    // Number of valid outputs per row: one per full 3-wide window.
    const int outW = w - 2;

    // Ring buffer with the horizontal-pass results of the last three rows.
    // Element j of a buffered row is the window covering columns j..j+2.
    float buf[3][WIDTH] = {{0.0f}};
    int bufIndex = 0;

    for (int i = 0; i < h; ++i) {
        // ---- Horizontal pass for row i ----
        const float* in = src[i];
        float* rowOut = buf[bufIndex];
        int j = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        // Four outputs per iteration. The widest load touches in[j+2 .. j+5],
        // so the bound j + 4 <= outW keeps every lane inside the row.
        for (; j + 4 <= outW; j += 4) {
            float32x4_t acc = vmulq_n_f32(vld1q_f32(in + j), kx[0]);
            acc = vmlaq_n_f32(acc, vld1q_f32(in + j + 1), kx[1]);
            acc = vmlaq_n_f32(acc, vld1q_f32(in + j + 2), kx[2]);
            vst1q_f32(rowOut + j, acc);
        }
#endif
        // Scalar tail (and the whole row when NEON is unavailable).
        for (; j < outW; ++j) {
            rowOut[j] = in[j] * kx[0] + in[j + 1] * kx[1] + in[j + 2] * kx[2];
        }

        // ---- Vertical pass once rows i-2, i-1 and i are buffered ----
        if (i >= 2) {
            const float* top = buf[(bufIndex + 1) % 3];  // row i-2
            const float* mid = buf[(bufIndex + 2) % 3];  // row i-1
            const float* bot = buf[bufIndex];            // row i
            // Window j is centred on column j+1 of row i-1.
            float* out = dst[i - 1] + 1;

            int c = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
            for (; c + 4 <= outW; c += 4) {
                float32x4_t acc = vmulq_n_f32(vld1q_f32(top + c), ky[0]);
                acc = vmlaq_n_f32(acc, vld1q_f32(mid + c), ky[1]);
                acc = vmlaq_n_f32(acc, vld1q_f32(bot + c), ky[2]);
                vst1q_f32(out + c, acc);
            }
#endif
            for (; c < outW; ++c) {
                out[c] = top[c] * ky[0] + mid[c] * ky[1] + bot[c] * ky[2];
            }
        }

        // Advance the ring buffer; the oldest row is overwritten next.
        bufIndex = (bufIndex + 1) % 3;
    }
}
|
|
|
|
|
|
|
|
|
|
int main() {
|
|
|
|
|
// 输入矩阵
|
|
|
|
|
float src[HEIGHT][WIDTH] = {
|
|
|
|
|
{10, 20, 30, 40, 50},
|
|
|
|
|
{60, 70, 80, 90, 100},
|
|
|
|
|
{110, 120, 130, 140, 150},
|
|
|
|
|
{160, 170, 180, 190, 200},
|
|
|
|
|
{210, 220, 230, 240, 250},
|
|
|
|
|
};
|
|
|
|
|
float dst[HEIGHT][WIDTH] = {0};
|
|
|
|
|
|
|
|
|
|
// 计时开始
|
|
|
|
|
clock_t start = clock();
|
|
|
|
|
|
|
|
|
|
// 调用优化后的GaussianBlur
|
|
|
|
|
applySeparableGaussianBlur(src, dst, HEIGHT, WIDTH, kx, ky);
|
|
|
|
|
|
|
|
|
|
// 计时结束
|
|
|
|
|
clock_t end = clock();
|
|
|
|
|
double elapsed = double(end - start) / CLOCKS_PER_SEC;
|
|
|
|
|
|
|
|
|
|
// 输出结果矩阵
|
|
|
|
|
cout << "Blurred Image:" << endl;
|
|
|
|
|
for (int i = 0; i < HEIGHT; ++i) {
|
|
|
|
|
for (int j = 0; j < WIDTH; ++j) {
|
|
|
|
|
cout << dst[i][j] << "\t";
|
|
|
|
|
}
|
|
|
|
|
cout << endl;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 输出运行时间
|
|
|
|
|
cout << "Execution time: " << elapsed << " seconds" << endl;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|