// opcomplex.cpp
//
// Times three implementations of a 3x3 Gaussian blur on an H x W float image:
//   1. applyGaussianBlur                   - direct 2-D convolution;
//   2. applySeparableGaussianBlur          - horizontal pass followed by a
//                                            vertical pass, using a 3-row ring
//                                            buffer of horizontally blurred rows;
//   3. applyOptimizedSeparableGaussianBlur - same as (2) with NEON-vectorized
//                                            inner loops (scalar fallback when
//                                            NEON is not available).
//
// All three write only interior pixels (1 <= i < h-1, 1 <= j < w-1); the
// one-pixel border of dst is left untouched.

#include <cstdio>
#include <cstdlib>
#include <ctime>

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define OPCOMPLEX_HAVE_NEON 1
#else
#define OPCOMPLEX_HAVE_NEON 0
#endif

// Maps a row index onto one of the three ring-buffer slots.
#define IDx(n) ((n) % 3)

#define H 600
#define W 600

// Returns true when an h x w region has at least one interior pixel and its
// rows fit inside the W-float row stride used by every image in this file.
static bool blurSizeIsValid(int h, int w) {
    return h >= 3 && w >= 3 && w <= W;
}

// Horizontal 3-tap convolution of one row, scalar form.
// Writes out[j] for first <= j < w-1; reads row[j-1 .. j+1].
static void blurRowScalar(const float *row, float *out, int first, int w, const float k[3]) {
    for (int j = first; j < w - 1; ++j) {
        out[j] = row[j - 1] * k[0] + row[j] * k[1] + row[j + 1] * k[2];
    }
}

// Vertical 3-tap combination of three already horizontally blurred rows,
// scalar form. Writes out[j] for first <= j < w-1.
static void blurColumnsScalar(const float *above, const float *center, const float *below,
                              float *out, int first, int w, const float k[3]) {
    for (int j = first; j < w - 1; ++j) {
        out[j] = above[j] * k[0] + center[j] * k[1] + below[j] * k[2];
    }
}

// Horizontal 3-tap convolution of one row, four pixels per step when NEON is
// available. The remaining (at most three) interior pixels are done in scalar
// code so that no load or store ever goes past column w-1.
static void blurRowFast(const float *row, float *out, int w, const float k[3]) {
    int j = 1;
#if OPCOMPLEX_HAVE_NEON
    // The widest load below touches row[j + 4], so j + 4 must stay <= w - 1.
    for (; j + 4 <= w - 1; j += 4) {
        const float32x4_t left  = vld1q_f32(row + j - 1);
        const float32x4_t mid   = vld1q_f32(row + j);
        const float32x4_t right = vld1q_f32(row + j + 1);

        float32x4_t acc = vmulq_n_f32(left, k[0]);  // k[0] * left
        acc = vmlaq_n_f32(acc, mid, k[1]);          // + k[1] * mid
        acc = vmlaq_n_f32(acc, right, k[2]);        // + k[2] * right

        vst1q_f32(out + j, acc);
    }
#endif
    blurRowScalar(row, out, j, w, k);
}

// Vertical 3-tap combination of three buffered rows, four pixels per step
// when NEON is available, with a scalar tail. Only columns 1 .. w-2 of out
// are written.
static void blurColumnsFast(const float *above, const float *center, const float *below,
                            float *out, int w, const float k[3]) {
    int j = 1;
#if OPCOMPLEX_HAVE_NEON
    // Each step stores out[j .. j+3]; j + 3 must stay <= w - 2 (interior only).
    for (; j + 4 <= w - 1; j += 4) {
        const float32x4_t up   = vld1q_f32(above + j);
        const float32x4_t mid  = vld1q_f32(center + j);
        const float32x4_t down = vld1q_f32(below + j);

        float32x4_t acc = vmulq_n_f32(up, k[0]);    // k[0] * row above
        acc = vmlaq_n_f32(acc, mid, k[1]);          // + k[1] * current row
        acc = vmlaq_n_f32(acc, down, k[2]);         // + k[2] * row below

        vst1q_f32(out + j, acc);
    }
#endif
    blurColumnsScalar(above, center, below, out, j, w, k);
}

// Direct 3x3 convolution.
//   src, dst : images with a row stride of W floats
//   h, w     : size of the region to process (w must not exceed W)
//   kernel   : 3x3 weights, kernel[r][c] applied to src[i-1+r][j-1+c]
// Does nothing when the region has no interior or w exceeds W.
void applyGaussianBlur(float src[][W], float dst[][W], int h, int w, float kernel[3][3]) {
    if (!blurSizeIsValid(h, w)) {
        return;
    }
    for (int i = 1; i < h - 1; ++i) {
        for (int j = 1; j < w - 1; ++j) {
            dst[i][j] =
                src[i - 1][j - 1] * kernel[0][0] + src[i - 1][j] * kernel[0][1] + src[i - 1][j + 1] * kernel[0][2] +
                src[i][j - 1]     * kernel[1][0] + src[i][j]     * kernel[1][1] + src[i][j + 1]     * kernel[1][2] +
                src[i + 1][j - 1] * kernel[2][0] + src[i + 1][j] * kernel[2][1] + src[i + 1][j + 1] * kernel[2][2];
        }
    }
}

// Separable blur, scalar version.
//   kx : horizontal 3-tap weights, ky : vertical 3-tap weights.
// Keeps the horizontally blurred versions of rows i-1, i and i+1 in a 3-slot
// ring buffer (slot chosen by IDx) so every source row is filtered
// horizontally exactly once.
// Does nothing when the region has no interior or w exceeds W.
void applySeparableGaussianBlur(float src[][W], float dst[][W], int h, int w, float kx[3], float ky[3]) {
    if (!blurSizeIsValid(h, w)) {
        return;
    }

    float buf[3][W];

    // Prime the ring buffer with the first two rows.
    for (int i = 0; i < 2; ++i) {
        blurRowScalar(src[i], buf[i], 1, w, kx);
    }

    for (int i = 1; i < h - 1; ++i) {
        // Horizontal pass for the row below the current one.
        blurRowScalar(src[i + 1], buf[IDx(i + 1)], 1, w, kx);
        // Vertical pass across the three buffered rows gives the final pixels.
        blurColumnsScalar(buf[IDx(i - 1)], buf[IDx(i)], buf[IDx(i + 1)], dst[i], 1, w, ky);
    }
}

// Separable blur with vectorized inner loops.
// Same contract as applySeparableGaussianBlur. With NEON the multiply-
// accumulate intrinsics are used, so results may differ from the scalar
// version in the last bits depending on how the compiler lowers them.
void applyOptimizedSeparableGaussianBlur(float src[][W], float dst[][W], int h, int w, float kx[3], float ky[3]) {
    if (!blurSizeIsValid(h, w)) {
        return;
    }

    float buf[3][W];

    // Prime the ring buffer with the first two rows.
    for (int i = 0; i < 2; ++i) {
        blurRowFast(src[i], buf[i], w, kx);
    }

    for (int i = 1; i < h - 1; ++i) {
        // Horizontal pass for the row below the current one.
        blurRowFast(src[i + 1], buf[IDx(i + 1)], w, kx);
        // Vertical pass across the three buffered rows gives the final pixels.
        blurColumnsFast(buf[IDx(i - 1)], buf[IDx(i)], buf[IDx(i + 1)], dst[i], w, ky);
    }
}

// Prints the elapsed CPU time between two clock() samples, in seconds.
static void printElapsed(clock_t start, clock_t end) {
    printf("耗时%lf秒\n", (double)(end - start) / CLOCKS_PER_SEC);
}

int main() {
    // Static storage: four 600x600 float images (~5.8 MB) are too large to
    // place safely on the stack. Static arrays are zero-initialized.
    static float src_t[H][W];
    static float dst1[H][W];
    static float dst2[H][W];
    static float dst3[H][W];

    float kernel[3][3] = {
        {1.0f / 16, 2.0f / 16, 1.0f / 16},
        {2.0f / 16, 4.0f / 16, 2.0f / 16},
        {1.0f / 16, 2.0f / 16, 1.0f / 16}
    };

    float kx[3] = {0.25f, 0.5f, 0.25f};
    float ky[3] = {0.25f, 0.5f, 0.25f};

    // Seed once; reseeding per pixel would restart the sequence each time and
    // fill the image with a single repeated value.
    srand((unsigned)time(NULL));
    for (auto &row : src_t) {
        for (float &pixel : row) {
            pixel = static_cast<float>(0.01 * rand());
        }
    }

    clock_t start = clock();
    applyGaussianBlur(src_t, dst1, H, W, kernel);
    clock_t end = clock();
    printElapsed(start, end);

    start = clock();
    applySeparableGaussianBlur(src_t, dst2, H, W, kx, ky);
    end = clock();
    printElapsed(start, end);

    start = clock();
    applyOptimizedSeparableGaussianBlur(src_t, dst3, H, W, kx, ky);
    end = clock();
    printElapsed(start, end);

    return 0;
}