pjc97uv2f 9 months ago
parent 7d28c79ebe
commit 18c7aab1d6

97
3

@ -0,0 +1,97 @@
#include <iostream>
#include <arm_neon.h>
#include <ctime>
using namespace std;
// Dimensions of the input image (rows x columns).
constexpr int HEIGHT = 5;
constexpr int WIDTH = 5;

// Separable Gaussian blur kernel: kx holds the 3 taps applied along a row,
// ky holds the 3 taps applied along a column.
constexpr float kx[3] = {0.27901f, 0.44198f, 0.27901f};
constexpr float ky[3] = {0.27901f, 0.44198f, 0.27901f};
// Horizontal 3-tap pass over one row.
// Writes out[j] = in[j]*k[0] + in[j+1]*k[1] + in[j+2]*k[2] for j in [0, w - 2).
// Never reads past in[w - 1] and never writes past out[w - 3].
static void convolveRow3(const float* in, float* out, int w, const float k[3]) {
    const int valid = w - 2; // number of positions with a full 3-pixel window
    int j = 0;
    // Vector body: only runs while all three 4-wide loads stay inside the row
    // (the widest load touches in[j + 5], and j + 4 <= valid guarantees j + 5 <= w - 1).
    for (; j + 4 <= valid; j += 4) {
        const float32x4_t left = vld1q_f32(in + j);
        const float32x4_t centre = vld1q_f32(in + j + 1);
        const float32x4_t right = vld1q_f32(in + j + 2);
        float32x4_t acc = vmulq_n_f32(left, k[0]);
        acc = vmlaq_n_f32(acc, centre, k[1]);
        acc = vmlaq_n_f32(acc, right, k[2]);
        vst1q_f32(out + j, acc);
    }
    // Scalar tail for the remaining (fewer than 4) positions.
    for (; j < valid; ++j) {
        out[j] = in[j] * k[0] + in[j + 1] * k[1] + in[j + 2] * k[2];
    }
}

// Vertical 3-tap pass: out[j] = top[j]*k[0] + mid[j]*k[1] + bottom[j]*k[2]
// for j in [0, count). All four pointers must address at least `count` floats.
static void combineRows3(const float* top, const float* mid, const float* bottom,
                         float* out, int count, const float k[3]) {
    int j = 0;
    for (; j + 4 <= count; j += 4) {
        float32x4_t acc = vmulq_n_f32(vld1q_f32(top + j), k[0]);
        acc = vmlaq_n_f32(acc, vld1q_f32(mid + j), k[1]);
        acc = vmlaq_n_f32(acc, vld1q_f32(bottom + j), k[2]);
        vst1q_f32(out + j, acc);
    }
    for (; j < count; ++j) {
        out[j] = top[j] * k[0] + mid[j] * k[1] + bottom[j] * k[2];
    }
}

// Applies a separable 3x3 blur to `src` and writes the result into `dst`.
//
// Parameters:
//   src, dst - images stored as rows of WIDTH floats; they must not overlap.
//   h, w     - number of rows / columns to process (w is clamped to WIDTH).
//   kx       - 3 weights applied along each row (horizontal pass).
//   ky       - 3 weights applied along each column (vertical pass).
//
// Only interior pixels dst[1..h-2][1..w-2] are written; the one-pixel border
// of `dst` is left untouched because no full 3x3 window exists there.
// Images smaller than 3x3 are left unchanged.
//
// Differences from the previous revision:
//   * the kernel is no longer loaded with a 4-float vector load from a
//     3-float array;
//   * vector loads/stores never run past the end of a row (scalar tail
//     handles the remainder);
//   * the vertical pass uses ky instead of silently reusing kx;
//   * results are stored at the window centre (column j + 1), matching the
//     row centring (row i - 1), so the output is no longer shifted left.
void applySeparableGaussianBlur(float src[][WIDTH], float dst[][WIDTH], int h, int w, const float kx[3], const float ky[3]) {
    // Rows are WIDTH floats wide, and so is the scratch buffer below.
    if (w > WIDTH) {
        w = WIDTH;
    }
    if (h < 3 || w < 3) {
        return; // no pixel has a complete 3x3 neighbourhood
    }

    // Ring buffer holding the horizontally filtered versions of the three
    // most recent source rows.
    float buf[3][WIDTH] = {};
    int slot = 0; // ring slot that receives the current row

    for (int i = 0; i < h; ++i) {
        convolveRow3(src[i], buf[slot], w, kx);

        // Once three filtered rows are available, combine them vertically.
        // Relative to the current slot, (slot + 1) % 3 is the oldest row
        // (i - 2) and (slot + 2) % 3 is the middle row (i - 1).
        if (i >= 2) {
            combineRows3(buf[(slot + 1) % 3], buf[(slot + 2) % 3], buf[slot],
                         &dst[i - 1][1], w - 2, ky);
        }

        slot = (slot + 1) % 3;
    }
}
// Demo driver: blurs a fixed 5x5 sample image, prints the result matrix and
// the CPU time spent inside the blur call.
int main() {
    // Sample input: values climb by 10 in row-major order.
    float src[HEIGHT][WIDTH] = {
        {10, 20, 30, 40, 50},
        {60, 70, 80, 90, 100},
        {110, 120, 130, 140, 150},
        {160, 170, 180, 190, 200},
        {210, 220, 230, 240, 250},
    };
    float dst[HEIGHT][WIDTH] = {0};

    // Time only the blur itself.
    const std::clock_t t0 = std::clock();
    applySeparableGaussianBlur(src, dst, HEIGHT, WIDTH, kx, ky);
    const std::clock_t t1 = std::clock();
    const double elapsed = static_cast<double>(t1 - t0) / CLOCKS_PER_SEC;

    // Print the output matrix, one tab-separated row per line.
    std::cout << "Blurred Image:" << std::endl;
    for (const auto& row : dst) {
        for (const float px : row) {
            std::cout << px << "\t";
        }
        std::cout << std::endl;
    }

    // Report the elapsed time in seconds.
    std::cout << "Execution time: " << elapsed << " seconds" << std::endl;
    return 0;
}
Loading…
Cancel
Save