parent
f93d42c58a
commit
27dd9a6fe4
@ -0,0 +1,68 @@
|
||||
#include <stdio.h>
|
||||
#include <ctime>
|
||||
#include <stdlib.h>
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define H 5
|
||||
#define W 5
|
||||
|
||||
/**
 * Apply a 3x3 filter kernel to the interior of an image.
 *
 * For every interior pixel (1 <= i < h-1, 1 <= j < w-1) this writes
 *
 *     dst[i][j] = sum over r,c in 0..2 of src[i-1+r][j-1+c] * kernel[r][c]
 *
 * The kernel is applied without flipping; for a symmetric kernel such as a
 * Gaussian this is identical to a true convolution. Border pixels of dst
 * (first/last row and column) are never written and keep whatever values
 * the caller put there.
 *
 * @param src     input image, declared as H x W floats
 * @param dst     output image, declared as H x W floats; must not alias src
 * @param h       number of rows to process; the caller must ensure h <= H
 * @param w       number of columns to process; the caller must ensure w <= W
 * @param kernel  3x3 filter weights, kernel[r][c] multiplies the neighbour at
 *                row offset r-1 and column offset c-1
 *
 * If h < 3 or w < 3 there are no interior pixels and nothing is written.
 */
void applyGaussianBlur(float src[H][W], float dst[H][W], int h, int w, float kernel[3][3]) {
    // Broadcast each of the nine kernel weights across its own NEON register.
    float32x4_t kv[3][3];
    for (int r = 0; r < 3; ++r) {
        for (int c = 0; c < 3; ++c) {
            kv[r][c] = vdupq_n_f32(kernel[r][c]);
        }
    }

    for (int i = 1; i < h - 1; ++i) {
        int j = 1;

        // Vector path: produce outputs for columns j..j+3 at once. The right
        // neighbour of the last lane is column j+4, so the group is only
        // taken when j+4 is still inside the row; this also keeps the store
        // off the right border column. (With a 5-wide image there are only
        // three interior columns, so this loop does not run and the scalar
        // tail below handles everything.)
        for (; j + 4 <= w - 1; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);
            for (int r = 0; r < 3; ++r) {
                const float *row = &src[i - 1 + r][j - 1];
                // Three overlapping unaligned loads give the left, centre and
                // right neighbours for all four lanes. A single register
                // rotated with vextq_f32 would wrap around and feed the upper
                // lanes pixels from the wrong side.
                acc = vmlaq_f32(acc, vld1q_f32(row),     kv[r][0]);
                acc = vmlaq_f32(acc, vld1q_f32(row + 1), kv[r][1]);
                acc = vmlaq_f32(acc, vld1q_f32(row + 2), kv[r][2]);
            }
            vst1q_f32(&dst[i][j], acc);
        }

        // Scalar tail: remaining interior columns that do not fill a full
        // group of four.
        for (; j < w - 1; ++j) {
            float sum = 0.0f;
            for (int r = 0; r < 3; ++r) {
                for (int c = 0; c < 3; ++c) {
                    sum += src[i - 1 + r][j - 1 + c] * kernel[r][c];
                }
            }
            dst[i][j] = sum;
        }
    }
}
|
||||
|
||||
/**
 * Demo driver: fills an H x W image with its row-major index, runs
 * applyGaussianBlur over it once, and prints the processor time reported by
 * clock() followed by the full dst matrix (one row per line).
 */
int main() {
    // Input and output buffers both start out zero-filled.
    float inputImage[H][W] = {0};
    float dst[H][W] = {0};

    // 3x3 weights 1-2-1 / 2-4-2 / 1-2-1 divided by 16, written out as
    // exactly representable decimals.
    float kernel[3][3] = {
        {0.0625f, 0.125f, 0.0625f},
        {0.125f,  0.25f,  0.125f},
        {0.0625f, 0.125f, 0.0625f}
    };

    // Each pixel gets its row-major position (row * W + col), tracked with a
    // running counter.
    int next_value = 0;
    for (int row = 0; row < H; ++row) {
        for (int col = 0; col < W; ++col) {
            inputImage[row][col] = next_value;
            ++next_value;
        }
    }

    // Time only the filter call itself.
    const clock_t t_begin = clock();
    applyGaussianBlur(inputImage, dst, H, W, kernel);
    const clock_t t_end = clock();

    const double time_spent = (double)(t_end - t_begin) / CLOCKS_PER_SEC;
    printf("运行时间:%lf秒\ndst矩阵结果为\n", time_spent);

    // Dump the result, space-separated, one image row per output line.
    for (int row = 0; row < H; ++row) {
        for (int col = 0; col < W; ++col) {
            printf("%f ", dst[row][col]);
        }
        printf("\n");
    }

    return 0;
}
|
Loading…
Reference in new issue