|
|
|
|
@ -0,0 +1,163 @@
|
|
|
|
|
#include <math.h>
#include <stdio.h>
#include <time.h>

#include <arm_neon.h> // NEON intrinsics
|
|
|
|
|
|
|
|
|
|
// Step 1: direct (non-separable) 3x3 convolution.
//
// For every interior pixel (rows 1..h-2, columns 1..w-2) the 3x3
// neighbourhood of `src` is multiplied element-wise with `kernel` and the
// sum is stored in `dst`. Border pixels of `dst` are never written, so the
// caller decides what they contain.
//
// src, dst : 5x5 images; only the top-left h x w region is used.
// h, w     : image height and width, at most 5 (the fixed array size).
//            Larger values are rejected and `dst` is left untouched.
// kernel   : 3x3 weights, indexed [row offset + 1][column offset + 1].
void applyGaussianBlur(float src[5][5], float dst[5][5], int h, int w, float kernel[3][3]) {
    enum { MAX_DIM = 5 };

    // The arrays hold exactly MAX_DIM x MAX_DIM floats; anything larger
    // would index past them.
    if (h > MAX_DIM || w > MAX_DIM) {
        return;
    }

    for (int i = 1; i < h - 1; i++) {
        for (int j = 1; j < w - 1; j++) {
            float sum = 0.0f;

            for (int m = -1; m <= 1; m++) {
                for (int n = -1; n <= 1; n++) {
                    sum += src[i + m][j + n] * kernel[m + 1][n + 1];
                }
            }

            dst[i][j] = sum;
        }
    }
}
|
|
|
|
|
|
|
|
|
|
// Step 2: separable 3x3 blur (scalar reference implementation).
//
// The 2-D filter is applied as two 1-D passes:
//   1. every row of `src` is filtered horizontally with `kx`;
//   2. for each interior row i, the horizontally filtered rows i-1, i and
//      i+1 are combined vertically with `ky`.
// The result equals a direct 3x3 convolution with the outer product
// ky (column) x kx (row).
//
// Only interior pixels (rows 1..h-2, columns 1..w-2) of `dst` are written.
//
// src, dst : 5x5 images; only the top-left h x w region is used.
// h, w     : image height and width, at most 5 (the fixed array size).
//            Larger values are rejected and `dst` is left untouched.
// kx, ky   : 3-tap horizontal / vertical weights; index 0 is the tap at
//            offset -1, index 2 the tap at offset +1.
void applySeparableGaussianBlur(float src[5][5], float dst[5][5], int h, int w, float kx[3], float ky[3]) {
    enum { MAX_DIM = 5 };
    float rows[MAX_DIM][MAX_DIM]; // horizontally filtered copy of src

    // `rows` and the image arrays hold MAX_DIM x MAX_DIM floats; larger
    // dimensions would index past them.
    if (h > MAX_DIM || w > MAX_DIM) {
        return;
    }

    // Horizontal pass over ALL rows: the vertical pass for output row i
    // needs filtered rows i-1 and i+1 as well as row i itself. Only the
    // interior columns are consumed later, so no border handling is needed.
    for (int i = 0; i < h; i++) {
        for (int j = 1; j < w - 1; j++) {
            rows[i][j] = src[i][j - 1] * kx[0]
                       + src[i][j]     * kx[1]
                       + src[i][j + 1] * kx[2];
        }
    }

    // Vertical pass over the interior rows.
    for (int i = 1; i < h - 1; i++) {
        for (int j = 1; j < w - 1; j++) {
            dst[i][j] = rows[i - 1][j] * ky[0]
                      + rows[i][j]     * ky[1]
                      + rows[i + 1][j] * ky[2];
        }
    }
}
|
|
|
|
|
|
|
|
|
|
// Computes out[x] = a[x]*k[0] + b[x]*k[1] + c[x]*k[2] for x in [0, n).
//
// `n` must be a multiple of 4 and all four buffers must hold at least `n`
// floats, because the NEON path always processes whole 4-lane vectors.
// When the compiler does not target NEON a scalar loop with the same
// arithmetic is used instead.
static void combineThreeTaps(const float *a, const float *b, const float *c,
                             const float k[3], float *out, int n) {
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    for (int x = 0; x < n; x += 4) {
        // Four neighbouring outputs per iteration: each input vector is
        // scaled by one scalar tap and accumulated.
        float32x4_t acc = vmulq_n_f32(vld1q_f32(a + x), k[0]);
        acc = vmlaq_n_f32(acc, vld1q_f32(b + x), k[1]);
        acc = vmlaq_n_f32(acc, vld1q_f32(c + x), k[2]);
        vst1q_f32(out + x, acc);
    }
#else
    for (int x = 0; x < n; x++) {
        out[x] = a[x] * k[0] + b[x] * k[1] + c[x] * k[2];
    }
#endif
}

// Step 3: separable 3x3 blur, vectorised across columns with NEON.
//
// Same contract as the scalar separable blur: every row is filtered
// horizontally with `kx`, then for each interior row i the filtered rows
// i-1, i, i+1 are combined vertically with `ky`. Only interior pixels
// (rows 1..h-2, columns 1..w-2) of `dst` are written.
//
// All vector loads and stores operate on local buffers padded to a multiple
// of four floats, so no access leaves `src`, `dst`, `kx` or `ky`.
//
// src, dst : 5x5 images; only the top-left h x w region is used.
// h, w     : image height and width, at most 5 (the fixed array size).
//            Larger values are rejected and `dst` is left untouched.
// kx, ky   : 3-tap horizontal / vertical weights; index 0 is the tap at
//            offset -1, index 2 the tap at offset +1.
void applySeparableGaussianBlurNEON(float src[5][5], float dst[5][5], int h, int w, float kx[3], float ky[3]) {
    enum { MAX_DIM = 5, PADDED = 8 }; // PADDED: MAX_DIM rounded up to 4 lanes
    float rows[MAX_DIM][PADDED];      // horizontally filtered rows

    // Nothing to do without an interior; larger sizes do not fit the
    // fixed-size arrays.
    if (h < 3 || w < 3 || h > MAX_DIM || w > MAX_DIM) {
        return;
    }

    // Horizontal pass over all rows.
    for (int i = 0; i < h; i++) {
        // Source row shifted right by one with zeros around it:
        // padded[x], padded[x+1], padded[x+2] are the left, centre and right
        // taps of column x, and taps outside the image contribute zero.
        float padded[PADDED + 2] = {0};
        for (int j = 0; j < w; j++) {
            padded[j + 1] = src[i][j];
        }
        combineThreeTaps(padded, padded + 1, padded + 2, kx, rows[i], PADDED);
    }

    // Vertical pass over the interior rows.
    for (int i = 1; i < h - 1; i++) {
        float out[PADDED];
        combineThreeTaps(rows[i - 1], rows[i], rows[i + 1], ky, out, PADDED);

        // Copy only the interior columns; the remaining lanes are padding.
        for (int j = 1; j < w - 1; j++) {
            dst[i][j] = out[j];
        }
    }
}
|
|
|
|
|
|
|
|
|
|
// Runs the three blur variants on a fixed 5x5 test image, checks that their
// outputs agree within a tolerance of 0.01, and prints the time spent in
// each call as measured with clock().
int main(void) {
    float src[5][5] = {
        {1, 1, 1, 1, 1},
        {1, 2, 2, 2, 1},
        {1, 2, 4, 2, 1},
        {1, 2, 2, 2, 1},
        {1, 1, 1, 1, 1}
    };

    float kernel[3][3] = {
        {1.0f / 16.0f, 2.0f / 16.0f, 1.0f / 16.0f},
        {2.0f / 16.0f, 4.0f / 16.0f, 2.0f / 16.0f},
        {1.0f / 16.0f, 2.0f / 16.0f, 1.0f / 16.0f}
    };

    // 1-D factors of `kernel`: (1,2,1)/4 in each direction, whose outer
    // product is the (1,2,1)x(1,2,1)/16 matrix above. Using /16 for both
    // factors would scale the separable result by an extra 1/16.
    float kx[3] = {1.0f / 4.0f, 2.0f / 4.0f, 1.0f / 4.0f};
    float ky[3] = {1.0f / 4.0f, 2.0f / 4.0f, 1.0f / 4.0f};

    // Zero-initialised because the blur functions only write interior pixels.
    float dst1[5][5] = {0};
    float dst2[5][5] = {0};
    float dst3[5][5] = {0};

    // NOTE(review): a single 5x5 blur is likely far below the resolution of
    // clock(), so the reported times may all print as 0 - confirm whether a
    // repeat loop is wanted for meaningful numbers.

    // Step 1: direct 3x3 Gaussian blur
    clock_t start1 = clock();
    applyGaussianBlur(src, dst1, 5, 5, kernel);
    clock_t end1 = clock();

    // Step 2: separable Gaussian blur
    clock_t start2 = clock();
    applySeparableGaussianBlur(src, dst2, 5, 5, kx, ky);
    clock_t end2 = clock();

    // Step 3: NEON-optimised separable Gaussian blur
    clock_t start3 = clock();
    applySeparableGaussianBlurNEON(src, dst3, 5, 5, kx, ky);
    clock_t end3 = clock();

    // Compare the outputs
    printf("Output comparison:\n");

    int i, j;
    int consistent = 1; // cleared as soon as any pair of pixels differs

    // dst1 vs dst2
    for (i = 0; i < 5; i++) {
        for (j = 0; j < 5; j++) {
            if (fabsf(dst1[i][j] - dst2[i][j]) > 0.01f) {
                consistent = 0;
            }
        }
    }

    // dst2 vs dst3
    for (i = 0; i < 5; i++) {
        for (j = 0; j < 5; j++) {
            if (fabsf(dst2[i][j] - dst3[i][j]) > 0.01f) {
                consistent = 0;
            }
        }
    }

    if (consistent) {
        printf("All outputs are consistent.\n");
    } else {
        printf("Outputs are inconsistent.\n");
    }

    // Report execution times
    printf("\nExecution time comparison:\n");
    printf("Step 1 execution time: %.6f seconds\n", (double)(end1 - start1) / CLOCKS_PER_SEC);
    printf("Step 2 execution time: %.6f seconds\n", (double)(end2 - start2) / CLOCKS_PER_SEC);
    printf("Step 3 execution time: %.6f seconds\n", (double)(end3 - start3) / CLOCKS_PER_SEC);

    return 0;
}
|
|
|
|
|
|