Merge pull request '把数据扩大，并做了一些优化' (#2) from p8sljnpht/opcomplex:main into main

2 years ago · 783f0bf04d
parent 046c4f7072 5f36cdfd4c
commit 783f0bf04d
4 changed files with 172 additions and 0 deletions
--- a/cxy_opt/1.c
+++ b/cxy_opt/1.c
@ -0,0 +1,33 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+#include "render.h"
+
+void applyGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], float kernel[3][3]);
+/*
+ * Benchmark driver for the direct 3x3 blur.
+ *
+ * Fills a MAT_SIZE x MAT_SIZE image through Render(), runs applyGaussianBlur()
+ * once while measuring processor time with clock(), prints the elapsed time,
+ * and passes the result to Print().
+ */
+int main() {
+	/* Weights (1 2 1 / 2 4 2 / 1 2 1) / 16; all exactly representable as float. */
+	float weights[3][3] = {
+		{0.0625f, 0.125f, 0.0625f},
+		{0.125f,  0.25f,  0.125f},
+		{0.0625f, 0.125f, 0.0625f}
+	};
+	float image[MAT_SIZE][MAT_SIZE];
+	/* Zero-initialised: the blur only writes interior pixels, so the border stays 0. */
+	float blurred[MAT_SIZE][MAT_SIZE] = {0};
+	clock_t t0, t1;
+	double seconds;
+
+	Render(image);
+
+	/* Only the convolution itself is timed. */
+	t0 = clock();
+	applyGaussianBlur(image, blurred, weights);
+	t1 = clock();
+
+	seconds = (double)(t1 - t0) / CLOCKS_PER_SEC;
+	printf("Time: %lf s\n", seconds);
+	Print(blurred);
+	return 0;
+}
+
+/*
+ * Direct 3x3 convolution of src with kernel, stored into dst.
+ *
+ * Only interior pixels (rows and columns 1 .. MAT_SIZE-2) are written; the
+ * outermost rows and columns of dst are left untouched.
+ * The nine products are summed left to right in a single expression, one
+ * kernel row at a time.
+ */
+void applyGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], float kernel[3][3]) {
+	for (int row = 1; row < MAT_SIZE-1; row++) {
+		float *above = src[row-1];
+		float *here  = src[row];
+		float *below = src[row+1];
+		float *out   = dst[row];
+
+		for (int col = 1; col < MAT_SIZE-1; col++) {
+			const int l = col-1;
+			const int r = col+1;
+
+			out[col] =
+				above[l]*kernel[0][0] + above[col]*kernel[0][1] + above[r]*kernel[0][2] +
+				here[l] *kernel[1][0] + here[col] *kernel[1][1] + here[r] *kernel[1][2] +
+				below[l]*kernel[2][0] + below[col]*kernel[2][1] + below[r]*kernel[2][2];
+		}
+	}
+}
--- a/cxy_opt/2.c
+++ b/cxy_opt/2.c
@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+#include "render.h"
+#define IDX(n) ((n) % 3)
+
+void applySeparableGaussianBlur(float[][MAT_SIZE], float[][MAT_SIZE], float[3], float[3]);
+/*
+ * Benchmark driver for the separable blur.
+ *
+ * Fills a MAT_SIZE x MAT_SIZE image through Render(), runs
+ * applySeparableGaussianBlur() once while measuring processor time with
+ * clock(), prints the elapsed time, and passes the result to Print().
+ */
+int main() {
+	/* 1-D taps (1 2 1) / 4 for each axis; all exactly representable as float. */
+	float tapsX[3] = {0.25f, 0.5f, 0.25f};
+	float tapsY[3] = {0.25f, 0.5f, 0.25f};
+	float image[MAT_SIZE][MAT_SIZE];
+	/* Zero-initialised: the blur only writes interior pixels, so the border stays 0. */
+	float blurred[MAT_SIZE][MAT_SIZE] = {0};
+	clock_t t0, t1;
+	double seconds;
+
+	Render(image);
+
+	/* Only the convolution itself is timed. */
+	t0 = clock();
+	applySeparableGaussianBlur(image, blurred, tapsX, tapsY);
+	t1 = clock();
+
+	seconds = (double)(t1 - t0) / CLOCKS_PER_SEC;
+	printf("Time: %lf s\n", seconds);
+	Print(blurred);
+	return 0;
+}
+
+/*
+ * Separable 3x3 blur: a horizontal 3-tap pass (kx) followed by a vertical
+ * 3-tap pass (ky), stored into dst.
+ *
+ * Only three horizontally filtered rows are kept at a time. Instead of
+ * indexing the scratch buffer modulo 3, three row pointers are rotated after
+ * every output row; the scratch rows are visited in the same cyclic order.
+ * Only interior pixels (rows and columns 1 .. MAT_SIZE-2) of dst are written.
+ */
+void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], float kx[3], float ky[3]) {
+	float buf[3][MAT_SIZE+3];
+	float *above = buf[0];
+	float *here  = buf[1];
+	float *below = buf[2];
+	int row, col;
+
+	/* Prime the window with the horizontal pass of source rows 0 and 1. */
+	for (row = 0; row < 2; row++) {
+		const float *in = src[row];
+		float *out = buf[row];
+
+		for (col = 1; col < MAT_SIZE-1; col++) {
+			out[col] = in[col-1]*kx[0]+in[col]*kx[1]+in[col+1]*kx[2];
+		}
+	}
+
+	/* Slide the three-row window down the image, one output row per step. */
+	for (row = 1; row < MAT_SIZE-1; row++) {
+		const float *in = src[row+1];
+		float *out = dst[row];
+		float *recycled;
+
+		/* Horizontal pass of the row entering the window. */
+		for (col = 1; col < MAT_SIZE-1; col++) {
+			below[col] = in[col-1]*kx[0]+in[col]*kx[1]+in[col+1]*kx[2];
+		}
+		/* Vertical pass across the three buffered rows. */
+		for (col = 1; col < MAT_SIZE-1; col++) {
+			out[col] = above[col]*ky[0]+here[col]*ky[1]+below[col]*ky[2];
+		}
+
+		/* The oldest row's storage is reused for the next incoming row. */
+		recycled = above;
+		above = here;
+		here = below;
+		below = recycled;
+	}
+}
--- a/cxy_opt/3.c
+++ b/cxy_opt/3.c
@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+#include <arm_neon.h>
+#include "render.h"
+#define IDX(n) ((n) % 3)
+
+void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], 
+	 float kx[], float ky[]);
+/*
+ * Benchmark driver for the NEON separable blur.
+ *
+ * Fills a MAT_SIZE x MAT_SIZE image through Render(), runs
+ * applySeparableGaussianBlur() once while measuring processor time with
+ * clock(), prints the elapsed time, and passes the result to Print().
+ */
+int main() {
+	float inputImage[MAT_SIZE][MAT_SIZE];
+	Render(inputImage);
+	/*
+	 * 1-D taps (1 2 1) / 4, padded with a trailing zero to four floats so that
+	 * a 4-lane vector load of the taps cannot read past the end of the array.
+	 */
+	float kx[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f};
+	float ky[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f};
+	/* Zero-initialised: the blur only writes interior pixels, so the border stays 0. */
+	float outputImage[MAT_SIZE][MAT_SIZE] = {0};
+	/* Only the convolution itself is timed. */
+	clock_t start = clock();
+	applySeparableGaussianBlur(inputImage, outputImage, kx, ky);
+	clock_t end   = clock();
+	printf("Time: %lf s\n", (double)(end-start) / CLOCKS_PER_SEC);
+	Print(outputImage);
+}
+
+/*
+ * Horizontal 3-tap pass over one image row:
+ *   out[j] = row[j-1]*kx[0] + row[j]*kx[1] + row[j+1]*kx[2]   for j = 1 .. MAT_SIZE-2
+ *
+ * Four outputs are produced per NEON iteration. Data is moved with
+ * vld1q_f32/vst1q_f32 on plain float pointers rather than by dereferencing
+ * float32x4_t pointers, because the addresses used here (&row[j-1], &row[j],
+ * &row[j+1]) are only float-aligned. Any columns left over when the interior
+ * width is not a multiple of four are handled by a scalar loop, so neither
+ * row nor out is accessed beyond index MAT_SIZE-1.
+ * Only kx[0], kx[1] and kx[2] are read.
+ */
+static void blurRowHorizontal(const float *row, float *out, const float *kx) {
+	const float k0 = kx[0];
+	const float k1 = kx[1];
+	const float k2 = kx[2];
+	int j = 1;
+
+	/* The rightmost load touches row[j+4], which must not exceed MAT_SIZE-1. */
+	for (; j + 4 <= MAT_SIZE-1; j += 4) {
+		float32x4_t acc = vmulq_n_f32(vld1q_f32(&row[j-1]), k0);
+		acc = vaddq_f32(acc, vmulq_n_f32(vld1q_f32(&row[j]), k1));
+		acc = vaddq_f32(acc, vmulq_n_f32(vld1q_f32(&row[j+1]), k2));
+		vst1q_f32(&out[j], acc);
+	}
+	/* Scalar tail for the remaining interior columns (none when MAT_SIZE-2 is a multiple of 4). */
+	for (; j < MAT_SIZE-1; j++) {
+		out[j] = row[j-1]*k0 + row[j]*k1 + row[j+1]*k2;
+	}
+}
+
+/*
+ * Separable 3x3 blur: NEON horizontal pass (kx) followed by a scalar vertical
+ * pass (ky), stored into dst.
+ *
+ * buf holds the horizontally filtered versions of three consecutive source
+ * rows; source row r lives in buf[IDX(r)], so each new row overwrites the one
+ * that has just left the 3-row window.
+ * Only interior pixels (rows and columns 1 .. MAT_SIZE-2) of dst are written.
+ * Only kx[0..2] and ky[0..2] are read.
+ */
+void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], 
+		float kx[], float ky[]) {
+	int i, j;
+	float buf[3][MAT_SIZE];
+
+	/* Horizontal pass of the first two rows to prime the window. */
+	for (i = 0; i < 2; i++) {
+		blurRowHorizontal(src[i], buf[i], kx);
+	}
+	/* Slide the window down: filter the incoming row, then combine vertically. */
+	for (i = 1; i < MAT_SIZE-1; i++) {
+		blurRowHorizontal(src[i+1], buf[IDX(i+1)], kx);
+		for (j = 1; j < MAT_SIZE-1; j++) {
+			dst[i][j] = buf[IDX(i-1)][j]*ky[0]+buf[IDX(i)][j]*ky[1]+buf[IDX(i+1)][j]*ky[2];
+		}
+	}
+}
--- a/cxy_opt/render.h
+++ b/cxy_opt/render.h
@ -0,0 +1,40 @@
+#ifndef __RENDER_H
+#define __RENDER_H
+#pragma GCC optimize ("O1")
+#include <stdlib.h>
+#include <time.h>
+
+#define MAT_SIZE 514
+#define RAND_SEED 114514
+void Render(float a[][MAT_SIZE]);
+void Print(float a[][MAT_SIZE]);
+
+/*
+ * Fill every element of a with a pseudo-random value between 0 and 1.
+ *
+ * rand() is re-seeded with RAND_SEED on every call, so the generated sequence
+ * restarts each time Render() runs.
+ */
+void Render(float a[][MAT_SIZE])
+{
+	int row, col;
+
+	srand(RAND_SEED);
+	for (row = 0; row < MAT_SIZE; row++) {
+		float *line = a[row];
+
+		for (col = 0; col < MAT_SIZE; col++) {
+			line[col] = (float)rand() / (float)RAND_MAX;
+		}
+	}
+}
+
+/*
+ * Print a corner summary of a: a size header, then the first three and last
+ * three columns of the first three and last three rows, with an ellipsis line
+ * between the two row groups.
+ *
+ * NOTE: printf is used here but this header does not include <stdio.h>
+ * itself, so the including file has to include it first.
+ */
+void Print(float a[][MAT_SIZE])
+{
+	/* Rows shown, top group first; the ellipsis line goes before index 3. */
+	static const int shown[6] = {
+		0, 1, 2, MAT_SIZE-3, MAT_SIZE-2, MAT_SIZE-1
+	};
+	int k;
+
+	printf("Matrix with Size=(%d,%d)\n", MAT_SIZE, MAT_SIZE);
+	for (k = 0; k < 6; k++) {
+		const float *row = a[shown[k]];
+
+		if (k == 3) {
+			printf("   ...   ...   ...   ...   ...   ...   ... \n");
+		}
+		printf("%5.1f %5.1f %5.1f  ...  %5.1f %5.1f %5.1f\n", row[0], row[1], row[2],
+			row[MAT_SIZE-3], row[MAT_SIZE-2], row[MAT_SIZE-1]);
+	}
+}
+#endif
+