You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
40 lines
1.6 KiB
40 lines
1.6 KiB
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <time.h>
|
|
#include <arm_neon.h>
|
|
/* NOTE(review): not referenced in the visible code; presumably the default
 * matrix dimension used by callers -- confirm. */
#define SIZE 1024
|
|
/* Tile edge length, in elements, for the blocked matrix routines in this file. */
#define BLOCK_SIZE 4
|
|
/**
 * Computes the matrix product C = A * B for square n x n float matrices.
 *
 * Each matrix is an array of n row pointers; every row must hold at least n
 * floats. C is overwritten (it does not need to be zeroed by the caller) and
 * must not alias A or B. Nothing is read or written outside the n x n region,
 * so n does not have to be a multiple of the vector width. n <= 0 is a no-op.
 *
 * The kernel keeps a small tile of C (TILE_ROWS rows x one 4-lane vector of
 * columns) in accumulators while sweeping k, so each loaded vector of B is
 * reused for every row of the tile. Columns that do not fill a whole vector,
 * and all columns on targets without NEON, use a plain scalar dot product.
 *
 * @param A  left operand, n rows of n floats (read only)
 * @param B  right operand, n rows of n floats (read only)
 * @param C  result, n rows of n floats (overwritten)
 * @param n  matrix dimension
 */
void matmul_optimized(float** A, float** B, float** C, int n) {
    /* LANES is fixed by float32x4_t; TILE_ROWS is the number of C rows whose
     * accumulators are kept live at once. */
    enum { LANES = 4, TILE_ROWS = 4 };

    for (int i = 0; i < n; i += TILE_ROWS) {
        /* The last tile may be shorter than TILE_ROWS. */
        const int rows = (n - i < TILE_ROWS) ? (n - i) : TILE_ROWS;
        int j = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        /* Vector path: only full groups of LANES columns, so every load and
         * store stays inside the row. */
        for (; j + LANES <= n; j += LANES) {
            float32x4_t acc[TILE_ROWS];

            for (int r = 0; r < rows; r++) {
                acc[r] = vdupq_n_f32(0.0f);
            }

            for (int k = 0; k < n; k++) {
                /* B[k][j..j+3] is shared by every row of the tile. */
                const float32x4_t b_vec = vld1q_f32(&B[k][j]);

                for (int r = 0; r < rows; r++) {
                    /* acc[r] += B[k][j..j+3] * A[i+r][k] (scalar broadcast). */
                    acc[r] = vmlaq_n_f32(acc[r], b_vec, A[i + r][k]);
                }
            }

            for (int r = 0; r < rows; r++) {
                vst1q_f32(&C[i + r][j], acc[r]);
            }
        }
#endif

        /* Scalar path: leftover columns (or every column without NEON). */
        for (; j < n; j++) {
            for (int r = 0; r < rows; r++) {
                float sum = 0.0f;

                for (int k = 0; k < n; k++) {
                    sum += A[i + r][k] * B[k][j];
                }
                C[i + r][j] = sum;
            }
        }
    }
}
|
|
/**
 * Expands a sparse matrix into a dense one.
 *
 * The logical rows x cols region of denseMatrix is cleared to 0.0f and then
 * every entry of sparse->elements[0 .. num_elements) is written at its
 * (row, col) position. If the same position occurs more than once, the last
 * entry wins.
 *
 * The region is cleared row by row because denseMatrix has a fixed row stride
 * of COLS: clearing rows * cols contiguous floats would hit the wrong cells
 * whenever cols != COLS.
 *
 * @param sparse       source matrix; a NULL pointer makes the call a no-op
 * @param denseMatrix  destination storage; a NULL pointer makes the call a no-op
 * @param rows         logical row count; values above ROWS are clamped to ROWS
 * @param cols         logical column count; values above COLS are clamped to COLS
 *
 * Entries whose row or col lies outside the (clamped) rows x cols region are
 * skipped instead of being written out of bounds.
 */
void sparse_to_dense(SparseMatrix* sparse, float denseMatrix[ROWS][COLS], int rows, int cols) {
    if (!sparse || !denseMatrix || rows <= 0 || cols <= 0) {
        return;
    }

    /* Never step outside the storage the parameter type promises. */
    if (rows > (int)ROWS) {
        rows = (int)ROWS;
    }
    if (cols > (int)COLS) {
        cols = (int)COLS;
    }

    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            denseMatrix[r][c] = 0.0f;
        }
    }

    for (int i = 0; i < sparse->num_elements; i++) {
        const int row = sparse->elements[i].row;
        const int col = sparse->elements[i].col;

        if (row < 0 || row >= rows || col < 0 || col >= cols) {
            continue; /* malformed entry: ignore rather than corrupt memory */
        }
        denseMatrix[row][col] = sparse->elements[i].value;
    }
}
|