parent
7ce6b2ad6b
commit
02d7ffebe2
@ -0,0 +1,110 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <arm_neon.h>
|
||||
/**
 * Scalar reference implementation of a separable 3-tap blur.
 *
 * The image is filtered horizontally with kx into a scratch buffer and the
 * scratch buffer is then filtered vertically with ky into dst. Taps that
 * fall outside the image are skipped (no renormalisation), which is the
 * same as treating pixels beyond the border as zero.
 *
 * @param src  input image, rows of 100 floats; only [0,h) x [0,w) is read
 * @param dst  output image, rows of 100 floats; only [0,h) x [0,w) is written
 * @param h    number of rows to process (1..100)
 * @param w    number of columns to process (1..100)
 * @param kx   horizontal kernel: left, centre, right weights
 * @param ky   vertical kernel: top, centre, bottom weights
 *
 * Non-positive dimensions are a no-op. Dimensions larger than 100 would
 * overrun the on-stack scratch buffer, so they are rejected with a message
 * on stderr and dst is left untouched.
 */
void applyGaussianBlur(float src[][100], float dst[][100], int h, int w, float kx[3], float ky[3]) {
    enum { MAX_DIM = 100 };
    float temp[MAX_DIM][MAX_DIM];

    if (h <= 0 || w <= 0) {
        return;
    }
    if (h > MAX_DIM || w > MAX_DIM) {
        fprintf(stderr, "applyGaussianBlur: %dx%d image exceeds the %dx%d scratch buffer\n",
                h, w, MAX_DIM, MAX_DIM);
        return;
    }

    /* Horizontal pass: src -> temp. */
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            float sum = 0.0f;
            for (int i = -1; i <= 1; i++) {
                int nx = x + i;
                if (nx >= 0 && nx < w) {
                    sum += src[y][nx] * kx[i + 1];
                }
            }
            temp[y][x] = sum;
        }
    }

    /* Vertical pass: temp -> dst. temp is complete before dst is written. */
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            float sum = 0.0f;
            for (int i = -1; i <= 1; i++) {
                int ny = y + i;
                if (ny >= 0 && ny < h) {
                    sum += temp[ny][x] * ky[i + 1];
                }
            }
            dst[y][x] = sum;
        }
    }
}
|
||||
/**
 * NEON-accelerated separable 3-tap blur.
 *
 * Same contract as the scalar version: a horizontal pass with kx into a
 * scratch buffer, then a vertical pass with ky into dst, with taps that
 * fall outside the image skipped (zero border).
 *
 * Both passes are vectorised along x, four pixels at a time, because that
 * is the direction in which pixels are contiguous in memory. Vector blocks
 * are only issued where every lane and every neighbour it reads lies inside
 * [0,w); the remaining pixels (left/right columns, top/bottom rows, and the
 * tail when the width is not a multiple of four) use the scalar path, so
 * nothing outside [0,h) x [0,w) is read or written.
 *
 * The products are accumulated in the same order as the scalar loop
 * (first, centre, last tap); results may still differ in the last bit if
 * the compiler contracts multiply-add differently in the two paths.
 *
 * @param src  input image, rows of 100 floats; only [0,h) x [0,w) is read
 * @param dst  output image, rows of 100 floats; only [0,h) x [0,w) is written
 * @param h    number of rows to process (1..100)
 * @param w    number of columns to process (1..100)
 * @param kx   horizontal kernel: left, centre, right weights
 * @param ky   vertical kernel: top, centre, bottom weights
 *
 * Non-positive dimensions are a no-op. Dimensions larger than 100 would
 * overrun the on-stack scratch buffer, so they are rejected with a message
 * on stderr and dst is left untouched.
 */
void applySeparableGaussianBlur(float src[][100], float dst[][100], int h, int w, float kx[3], float ky[3]) {
    enum { MAX_DIM = 100, LANES = 4 };
    float temp[MAX_DIM][MAX_DIM];

    if (h <= 0 || w <= 0) {
        return;
    }
    if (h > MAX_DIM || w > MAX_DIM) {
        fprintf(stderr, "applySeparableGaussianBlur: %dx%d image exceeds the %dx%d scratch buffer\n",
                h, w, MAX_DIM, MAX_DIM);
        return;
    }

    /* ---- Horizontal pass: src -> temp ---- */
    const float32x4_t kx_first = vdupq_n_f32(kx[0]);
    const float32x4_t kx_centre = vdupq_n_f32(kx[1]);
    const float32x4_t kx_last = vdupq_n_f32(kx[2]);

    for (int y = 0; y < h; y++) {
        int x = 0;

        /* Column 0 has no left neighbour: scalar with bounds checks. */
        {
            float sum = 0.0f;
            for (int i = -1; i <= 1; i++) {
                int nx = x + i;
                if (nx >= 0 && nx < w) {
                    sum += src[y][nx] * kx[i + 1];
                }
            }
            temp[y][x] = sum;
            x = 1;
        }

        /* Lanes x..x+3 read x-1..x+4, so the block needs x + 4 <= w - 1. */
        for (; x + LANES <= w - 1; x += LANES) {
            float32x4_t left = vld1q_f32(&src[y][x - 1]);
            float32x4_t centre = vld1q_f32(&src[y][x]);
            float32x4_t right = vld1q_f32(&src[y][x + 1]);

            float32x4_t acc = vmulq_f32(left, kx_first);
            acc = vmlaq_f32(acc, centre, kx_centre);
            acc = vmlaq_f32(acc, right, kx_last);

            vst1q_f32(&temp[y][x], acc);
        }

        /* Tail, including the right-most column. */
        for (; x < w; x++) {
            float sum = 0.0f;
            for (int i = -1; i <= 1; i++) {
                int nx = x + i;
                if (nx >= 0 && nx < w) {
                    sum += src[y][nx] * kx[i + 1];
                }
            }
            temp[y][x] = sum;
        }
    }

    /* ---- Vertical pass: temp -> dst ---- */
    const float32x4_t ky_first = vdupq_n_f32(ky[0]);
    const float32x4_t ky_centre = vdupq_n_f32(ky[1]);
    const float32x4_t ky_last = vdupq_n_f32(ky[2]);

    for (int y = 0; y < h; y++) {
        int x = 0;

        /* Interior rows have both vertical neighbours, so four adjacent
         * columns can be combined at once from rows y-1, y and y+1. */
        if (y >= 1 && y + 1 < h) {
            for (; x + LANES <= w; x += LANES) {
                float32x4_t up = vld1q_f32(&temp[y - 1][x]);
                float32x4_t mid = vld1q_f32(&temp[y][x]);
                float32x4_t down = vld1q_f32(&temp[y + 1][x]);

                float32x4_t acc = vmulq_f32(up, ky_first);
                acc = vmlaq_f32(acc, mid, ky_centre);
                acc = vmlaq_f32(acc, down, ky_last);

                vst1q_f32(&dst[y][x], acc);
            }
        }

        /* Top/bottom rows and the column tail: scalar with bounds checks. */
        for (; x < w; x++) {
            float sum = 0.0f;
            for (int i = -1; i <= 1; i++) {
                int ny = y + i;
                if (ny >= 0 && ny < h) {
                    sum += temp[ny][x] * ky[i + 1];
                }
            }
            dst[y][x] = sum;
        }
    }
}
|
||||
/**
 * Runs one blur implementation once and prints how much processor time,
 * as reported by clock(), the call consumed.
 *
 * @param func  blur routine to time
 * @param src   input image forwarded to func
 * @param dst   output image forwarded to func
 * @param h     row count forwarded to func
 * @param w     column count forwarded to func
 * @param kx    horizontal kernel forwarded to func
 * @param ky    vertical kernel forwarded to func
 */
void benchmark(void (*func)(float[][100], float[][100], int, int, float[], float[]), float src[][100], float dst[][100], int h, int w, float kx[3], float ky[3]) {
    const clock_t t_begin = clock();
    func(src, dst, h, w, kx, ky);
    const clock_t t_finish = clock();

    /* Convert clock ticks to seconds before printing. */
    const clock_t ticks = t_finish - t_begin;
    printf("Time spent: %f seconds\n", (double)ticks / CLOCKS_PER_SEC);
}
|
||||
int main() {
|
||||
int h = 100, w = 100;
|
||||
float src[100][100];
|
||||
float dst[100][100];
|
||||
float kx[3] = {0.25, 0.5, 0.25};
|
||||
float ky[3] = {0.25, 0.5, 0.25};
|
||||
for (int i = 0; i < h; i++) {
|
||||
for (int j = 0; j < w; j++) {
|
||||
src[i][j] = ((float)rand()) / RAND_MAX;
|
||||
}
|
||||
}
|
||||
printf("Original Gaussian Blur:\n");
|
||||
benchmark(applyGaussianBlur, src, dst, h, w, kx, ky);
|
||||
printf("NEON Optimized Gaussian Blur:\n");
|
||||
benchmark(applySeparableGaussianBlur, src, dst, h, w, kx, ky);
|
||||
return 0;
|
||||
}
|
Loading…
Reference in new issue