// 3x3 Gaussian blur demo on a small H x W float image using ARM NEON intrinsics.
#include <stdio.h>
#include <stdlib.h>

#include <ctime>

/* Only pull in the NEON intrinsics when the target actually provides them. */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

/* Image dimensions used for every image buffer in this file: H rows by W columns. */
#define H 5
#define W 5

/*
 * Weighted sum of the 3x3 neighbourhood centred on src[row][col]:
 * kernel[a][b] multiplies src[row - 1 + a][col - 1 + b].
 * The caller must pass an interior pixel so that every neighbour exists.
 */
static inline float blurPixelScalar(float src[H][W], int row, int col, float kernel[3][3]) {
    float acc = 0.0f;
    for (int a = 0; a < 3; ++a) {
        for (int b = 0; b < 3; ++b) {
            acc += kernel[a][b] * src[row - 1 + a][col - 1 + b];
        }
    }
    return acc;
}

/*
 * Apply a 3x3 kernel to the interior pixels of src and store the result in dst.
 *
 * For every pixel with 1 <= i <= h - 2 and 1 <= j <= w - 2:
 *     dst[i][j] = sum over a, b in 0..2 of kernel[a][b] * src[i - 1 + a][j - 1 + b]
 *
 * src    - input image, stored as an H x W array
 * dst    - output image, stored as an H x W array; border pixels (first/last
 *          row and column of the h x w region) are never written
 * h, w   - number of rows / columns to process; values larger than H / W are
 *          clamped so the function never indexes outside the arrays
 * kernel - 3x3 weights, kernel[0][0] applies to the top-left neighbour
 *
 * When NEON is available, four adjacent output pixels are computed at once;
 * any pixels that do not fill a whole group of four use the scalar helper.
 */
void applyGaussianBlur(float src[H][W], float dst[H][W], int h, int w, float kernel[3][3]) {
    if (h > H) {
        h = H;
    }
    if (w > W) {
        w = W;
    }

    for (int i = 1; i < h - 1; ++i) {
        int j = 1;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        /*
         * Outputs j..j+3 read columns j-1..j+4, so a group is only taken when
         * column j+4 is still inside the row (j + 4 <= w - 1). Shifted loads
         * are used for the left/centre/right taps, so no lane ever wraps.
         */
        for (; j + 4 <= w - 1; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);
            for (int a = 0; a < 3; ++a) {
                const float *row = &src[i - 1 + a][j - 1];
                acc = vmlaq_n_f32(acc, vld1q_f32(row), kernel[a][0]);     /* left   */
                acc = vmlaq_n_f32(acc, vld1q_f32(row + 1), kernel[a][1]); /* centre */
                acc = vmlaq_n_f32(acc, vld1q_f32(row + 2), kernel[a][2]); /* right  */
            }
            vst1q_f32(&dst[i][j], acc);
        }
#endif

        /* Remaining interior pixels (all of them when NEON is unavailable). */
        for (; j < w - 1; ++j) {
            dst[i][j] = blurPixelScalar(src, i, j, kernel);
        }
    }
}

int main() {
|
|
float inputImage[H][W] = {0};
|
|
float dst[H][W] = {0};
|
|
float kernel[3][3] = {
|
|
{1.0f/16, 2.0f/16, 1.0f/16},
|
|
{2.0f/16, 4.0f/16, 2.0f/16},
|
|
{1.0f/16, 2.0f/16, 1.0f/16}
|
|
};
|
|
|
|
// Initialize inputImage with some values
|
|
for (int i = 0; i < H; i++) {
|
|
for (int j = 0; j < W; j++) {
|
|
inputImage[i][j] = i * W + j;
|
|
}
|
|
}
|
|
|
|
clock_t start = clock();
|
|
applyGaussianBlur(inputImage, dst, H, W, kernel);
|
|
clock_t end = clock();
|
|
double time_spent = double(end - start) / CLOCKS_PER_SEC;
|
|
printf("运行时间:%lf秒\ndst矩阵结果为\n", time_spent);
|
|
|
|
for (int i = 0; i < H; i++) {
|
|
for (int j = 0; j < W; j++) {
|
|
printf("%f ", dst[i][j]);
|
|
}
|
|
printf("\n");
|
|
}
|
|
|
|
return 0;
|
|
} |