Merge pull request '把数据扩大,并做了一些优化' (#2) from p8sljnpht/opcomplex:main into main

main
pi7mcrg2k 8 months ago
commit 783f0bf04d

@ -0,0 +1,33 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include "render.h"
void applyGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], float kernel[3][3]);
/*
 * Benchmark driver for the direct 3x3 Gaussian blur.
 *
 * Fills the input matrix with Render(), runs applyGaussianBlur() once while
 * measuring processor time with clock(), then prints the elapsed time and a
 * Print() summary of the output matrix.
 */
int main() {
    /* 3x3 weights (1 2 1 / 2 4 2 / 1 2 1) / 16, written as exact binary fractions. */
    float weights[3][3] = {
        {0.0625f, 0.125f, 0.0625f},
        {0.125f,  0.25f,  0.125f},
        {0.0625f, 0.125f, 0.0625f}
    };
    float source[MAT_SIZE][MAT_SIZE];
    float blurred[MAT_SIZE][MAT_SIZE] = {0};

    Render(source);

    clock_t tick_begin = clock();
    applyGaussianBlur(source, blurred, weights);
    clock_t tick_end = clock();

    double elapsed_seconds = (double)(tick_end - tick_begin) / CLOCKS_PER_SEC;
    printf("Time: %lf s\n", elapsed_seconds);
    Print(blurred);
    return 0;
}
/*
 * Direct 3x3 convolution of src with kernel, written into dst.
 *
 * Only interior cells (rows and columns 1 .. MAT_SIZE-2) are written; the
 * outermost rows and columns of dst are left as the caller provided them.
 * The nine products are summed in row-major kernel order.
 */
void applyGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], float kernel[3][3]) {
    for (int row = 1; row < MAT_SIZE - 1; row++) {
        const float *above = src[row - 1];
        const float *here  = src[row];
        const float *below = src[row + 1];
        float *out = dst[row];

        for (int col = 1; col < MAT_SIZE - 1; col++) {
            out[col] =
                above[col - 1] * kernel[0][0] + above[col] * kernel[0][1] + above[col + 1] * kernel[0][2] +
                here[col - 1]  * kernel[1][0] + here[col]  * kernel[1][1] + here[col + 1]  * kernel[1][2] +
                below[col - 1] * kernel[2][0] + below[col] * kernel[2][1] + below[col + 1] * kernel[2][2];
        }
    }
}

@ -0,0 +1,38 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include "render.h"
#define IDX(n) ((n) % 3)
void applySeparableGaussianBlur(float[][MAT_SIZE], float[][MAT_SIZE], float[3], float[3]);
/*
 * Benchmark driver for the separable (row pass, then column pass) Gaussian blur.
 *
 * Fills the input matrix with Render(), runs applySeparableGaussianBlur()
 * once while measuring processor time with clock(), then prints the elapsed
 * time and a Print() summary of the output matrix.
 */
int main() {
    /* 1-D weights (1 2 1) / 4 for the horizontal and vertical passes. */
    float kx[3] = {0.25f, 0.5f, 0.25f};
    float ky[3] = {0.25f, 0.5f, 0.25f};
    float source[MAT_SIZE][MAT_SIZE];
    float blurred[MAT_SIZE][MAT_SIZE] = {0};

    Render(source);

    clock_t tick_begin = clock();
    applySeparableGaussianBlur(source, blurred, kx, ky);
    clock_t tick_end = clock();

    double elapsed_seconds = (double)(tick_end - tick_begin) / CLOCKS_PER_SEC;
    printf("Time: %lf s\n", elapsed_seconds);
    Print(blurred);
    return 0;
}
/*
 * Separable 3-tap blur: a horizontal pass with kx followed by a vertical pass
 * with ky, written into the interior cells of dst (rows and columns
 * 1 .. MAT_SIZE-2). Border cells of dst are not written.
 *
 * Only three horizontally filtered rows are kept at a time. They live in a
 * small scratch buffer whose rows are recycled as the window slides down.
 */
void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE], float kx[3], float ky[3]) {
    float buf[3][MAT_SIZE+3];
    float *prev = buf[0];   /* filtered row i-1 */
    float *curr = buf[1];   /* filtered row i   */
    float *next = buf[2];   /* filtered row i+1 */

    /* Prime the window with the horizontal pass of rows 0 and 1. */
    for (int col = 1; col < MAT_SIZE - 1; col++) {
        prev[col] = src[0][col - 1] * kx[0] + src[0][col] * kx[1] + src[0][col + 1] * kx[2];
    }
    for (int col = 1; col < MAT_SIZE - 1; col++) {
        curr[col] = src[1][col - 1] * kx[0] + src[1][col] * kx[1] + src[1][col + 1] * kx[2];
    }

    for (int row = 1; row < MAT_SIZE - 1; row++) {
        const float *below = src[row + 1];
        float *out = dst[row];

        /* Horizontal pass for the row entering the window. */
        for (int col = 1; col < MAT_SIZE - 1; col++) {
            next[col] = below[col - 1] * kx[0] + below[col] * kx[1] + below[col + 1] * kx[2];
        }
        /* Vertical pass across the three buffered rows. */
        for (int col = 1; col < MAT_SIZE - 1; col++) {
            out[col] = prev[col] * ky[0] + curr[col] * ky[1] + next[col] * ky[2];
        }

        /* Slide the window: the oldest row's storage is reused for the next one. */
        float *recycled = prev;
        prev = curr;
        curr = next;
        next = recycled;
    }
}

@ -0,0 +1,61 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <arm_neon.h>
#include "render.h"
#define IDX(n) ((n) % 3)
void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE],
float kx[], float ky[]);
/*
 * Benchmark driver for the NEON separable Gaussian blur.
 *
 * Fills the input matrix with Render(), runs applySeparableGaussianBlur()
 * once while measuring processor time with clock(), then prints the elapsed
 * time and a Print() summary of the output matrix.
 */
int main() {
    float inputImage[MAT_SIZE][MAT_SIZE];
    Render(inputImage);
    /*
     * 1-D weights (1 2 1) / 4. Each array carries a fourth, zero element so
     * that a full 4-lane vector load of the kernel stays in bounds.
     */
    float kx[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f};
    float ky[4] = {1.0f/4, 1.0f/2, 1.0f/4, 0.0f};
    float outputImage[MAT_SIZE][MAT_SIZE] = {0};
    clock_t start = clock();
    applySeparableGaussianBlur(inputImage, outputImage, kx, ky);
    clock_t end = clock();
    printf("Time: %lf s\n", (double)(end-start) / CLOCKS_PER_SEC);
    Print(outputImage);
    return 0;
}
/*
 * Horizontal 3-tap pass over one row:
 *     out[j] = in[j-1]*kx[0] + in[j]*kx[1] + in[j+1]*kx[2]
 * for j in 1 .. MAT_SIZE-2. out[0] and out[MAT_SIZE-1] are not written.
 *
 * Four outputs are produced per NEON iteration. Loads and stores go through
 * vld1q_f32/vst1q_f32, which only need float alignment; the windows start at
 * odd element offsets, so dereferencing a float32x4_t pointer there would be
 * a misaligned vector access. Columns left over when MAT_SIZE-2 is not a
 * multiple of four are finished by a scalar loop, so no access goes past the
 * end of the row.
 */
static void convolveRowNeon(const float *in, float *out, const float *kx) {
    int j = 1;

    /* Each iteration reads in[j-1 .. j+4] and writes out[j .. j+3]. */
    for (; j + 4 <= MAT_SIZE - 1; j += 4) {
        float32x4_t acc = vmulq_n_f32(vld1q_f32(&in[j - 1]), kx[0]);
        acc = vaddq_f32(acc, vmulq_n_f32(vld1q_f32(&in[j]), kx[1]));
        acc = vaddq_f32(acc, vmulq_n_f32(vld1q_f32(&in[j + 1]), kx[2]));
        vst1q_f32(&out[j], acc);
    }

    /* Scalar tail for the remaining interior columns. */
    for (; j < MAT_SIZE - 1; j++) {
        out[j] = in[j - 1] * kx[0] + in[j] * kx[1] + in[j + 1] * kx[2];
    }
}

/*
 * Separable 3-tap blur: a NEON horizontal pass with kx followed by a scalar
 * vertical pass with ky, written into the interior cells of dst (rows and
 * columns 1 .. MAT_SIZE-2). Border cells of dst are not written.
 *
 * Only elements 0..2 of kx and ky are read, so three-element kernels are
 * sufficient; longer arrays are accepted and the extra elements ignored.
 *
 * buf is a three-row ring buffer of horizontally filtered rows, indexed with
 * IDX(row) so the slot of row i-2 is overwritten by row i+1.
 */
void applySeparableGaussianBlur(float src[][MAT_SIZE], float dst[][MAT_SIZE],
                                float kx[], float ky[]) {
    float buf[3][MAT_SIZE];

    /* Prime the ring buffer with the horizontal pass of rows 0 and 1. */
    convolveRowNeon(src[0], buf[0], kx);
    convolveRowNeon(src[1], buf[1], kx);

    for (int i = 1; i < MAT_SIZE - 1; i++) {
        /* Horizontal pass for the row entering the window. */
        convolveRowNeon(src[i + 1], buf[IDX(i + 1)], kx);

        /* Vertical pass across the three buffered rows. */
        for (int j = 1; j < MAT_SIZE - 1; j++) {
            dst[i][j] = buf[IDX(i - 1)][j] * ky[0] + buf[IDX(i)][j] * ky[1] + buf[IDX(i + 1)][j] * ky[2];
        }
    }
}

@ -0,0 +1,40 @@
#ifndef __RENDER_H
#define __RENDER_H
#pragma GCC optimize ("O1")
#include <stdlib.h>
#include <time.h>
#define MAT_SIZE 514
#define RAND_SEED 114514
void Render(float a[][MAT_SIZE]);
void Print(float a[][MAT_SIZE]);
/*
 * Fills every cell of a with rand() / RAND_MAX, i.e. a value in [0, 1].
 * The generator is reseeded with RAND_SEED on every call, so repeated calls
 * produce the same matrix for a given C library.
 */
void Render(float a[][MAT_SIZE])
{
    srand(RAND_SEED);
    for (int row = 0; row < MAT_SIZE; row++) {
        float *cells = a[row];
        for (int col = 0; col < MAT_SIZE; col++) {
            cells[col] = (float)rand() / (float)RAND_MAX;
        }
    }
}
/*
 * Prints a corner summary of a: a size header, then the first three and last
 * three columns of the first three rows, a separator line, and the same six
 * columns of the last three rows.
 */
void Print(float a[][MAT_SIZE])
{
    printf("Matrix with Size=(%d,%d)\n", MAT_SIZE, MAT_SIZE);

    /* Slots 0..2 map to rows 0..2; slots 3..5 map to the last three rows. */
    for (int slot = 0; slot < 6; slot++) {
        const float *row = a[slot < 3 ? slot : MAT_SIZE - 6 + slot];

        if (slot == 3) {
            printf(" ... ... ... ... ... ... ... \n");
        }
        printf("%5.1f %5.1f %5.1f ... %5.1f %5.1f %5.1f\n", row[0], row[1], row[2],
               row[MAT_SIZE-3], row[MAT_SIZE-2], row[MAT_SIZE-1]);
    }
}
#endif
Loading…
Cancel
Save