You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
42 lines
1.4 KiB
42 lines
1.4 KiB
#include <arm_neon.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <time.h>
|
|
|
|
/* Number of taps in each 1-D kernel (length of the row/column kernel arrays). */
#define KERNEL_SIZE 5
|
|
/* Side length of the square int image buffers (IMAGE_SIZE x IMAGE_SIZE).
 * NOTE(review): this is smaller than KERNEL_SIZE, so no pixel has a full
 * KERNEL_SIZE-wide neighbourhood inside the image — confirm this is intended. */
#define IMAGE_SIZE 3
|
|
|
|
/**
 * Horizontal (row) pass of a separable convolution.
 *
 * For every row i and every interior column j (1 .. IMAGE_SIZE-2) computes
 *
 *     temp[i][j] = (int)( sum_k input[i][j - KERNEL_SIZE/2 + k] * row_kernel[k] / 256 )
 *
 * Taps that fall outside the row are clamped to the nearest valid column
 * (edge replication), so the function never reads outside input[i][].
 *
 * @param input       source image, read only.
 * @param temp        destination image; only columns 1 .. IMAGE_SIZE-2 of each
 *                    row are written, columns 0 and IMAGE_SIZE-1 are left
 *                    untouched.
 * @param row_kernel  KERNEL_SIZE filter coefficients, read only.
 */
void applyRowConvolution_NEON(int input[IMAGE_SIZE][IMAGE_SIZE], int temp[IMAGE_SIZE][IMAGE_SIZE], float row_kernel[KERNEL_SIZE]) {
    const int radius = KERNEL_SIZE / 2;
    /* 1/256 is exactly representable, so multiplying gives the same result as
     * dividing and avoids vdivq_f32, which is only available on AArch64. */
    const float inv_norm = 1.0f / 256.0f;

    for (int i = 0; i < IMAGE_SIZE; i++) {
        for (int j = 1; j < IMAGE_SIZE - 1; j++) {
            /* Gather the neighbourhood as real float values. The pixels are
             * ints, so they must be converted numerically; reinterpreting
             * their bits as float would produce garbage. */
            float window[KERNEL_SIZE];
            for (int k = 0; k < KERNEL_SIZE; k++) {
                int col = j - radius + k;
                if (col < 0) {
                    col = 0;
                } else if (col > IMAGE_SIZE - 1) {
                    col = IMAGE_SIZE - 1;
                }
                window[k] = (float)input[i][col];
            }

            /* Multiply-accumulate four taps at a time. */
            float32x4_t v_acc = vdupq_n_f32(0.0f);
            int k = 0;
            for (; k + 4 <= KERNEL_SIZE; k += 4) {
                float32x4_t v_pixels = vld1q_f32(&window[k]);
                float32x4_t v_coeffs = vld1q_f32(&row_kernel[k]);
                v_acc = vmlaq_f32(v_acc, v_pixels, v_coeffs);
            }

            /* Horizontal add of the four lanes: every lane holds one partial
             * product, and all of them belong to this output pixel. */
            float32x2_t v_half = vadd_f32(vget_low_f32(v_acc), vget_high_f32(v_acc));
            float sum = vget_lane_f32(vpadd_f32(v_half, v_half), 0);

            /* Remaining taps when KERNEL_SIZE is not a multiple of four
             * (e.g. the fifth tap of a 5-tap kernel). */
            for (; k < KERNEL_SIZE; k++) {
                sum += window[k] * row_kernel[k];
            }

            temp[i][j] = (int)(sum * inv_norm);
        }
    }
}
|
|
|
|
void applyColumnConvolution_NEON(int temp[IMAGE_SIZE][IMAGE_SIZE], int output[IMAGE_SIZE][IMAGE_SIZE], float col_kernel[KERNEL_SIZE]) {
|
|
float32x4_t v_col_kernel = vld1q_f32(col_kernel);
|
|
float32x4_t v_sum = vdupq_n_f32(256.0);
|
|
|
|
for (int i = 1; i < IMAGE_SIZE - 1; i++) {
|
|
for (int j = 0; j < IMAGE_SIZE; j++) {
|
|
float32x4_t v_blurred_value = vdupq_n_f32(0.0);
|
|
for (int kj = 0; kj < KERNEL_SIZE - 1; kj += 4) {
|
|
int x = i - KERNEL_SIZE / 2 + kj;
|
|
int y = j;
|
|
float32x4_t v_input = vld1q_f32((float *)&temp[x][y]);
|
|
v_blurred_value = vmlaq_f32(v_blurred_value, v_input, v_col_kernel);
|
|
}
|
|
float32x4_t v_result = vdiv
|