diff --git a/step6.c b/step6.c new file mode 100644 index 0000000..0daf89e --- /dev/null +++ b/step6.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include +#define SIZE 1024 +#define BLOCK_SIZE 4 +void matmul_optimized(float** A, float** B, float** C, int n) { + for (int i = 0; i < n; i += BLOCK_SIZE) { + for (int j = 0; j < n; j += BLOCK_SIZE) { + float32x4_t vecC[BLOCK_SIZE][BLOCK_SIZE] = {0}; + for (int k = 0; k < n; k += BLOCK_SIZE) { + for (int ii = 0; ii < BLOCK_SIZE && i + ii < n; ii++) { + for (int jj = 0; jj < BLOCK_SIZE && k + jj < n; jj++) { + float32x4_t vecA = vld1q_f32(&A[i + ii][k]); + float32x4_t vecB = vld1q_f32(&B[k + jj][j]); + for (int l = 0; l < BLOCK_SIZE && j + l < n; l++) { + vecC[ii][l] = vmlaq_f32(vecC[ii][l], vecA, vecB); + vecB = vld1q_f32(&B[k + jj][j + BLOCK_SIZE + l]); + } + } + } + } + for (int ii = 0; ii < BLOCK_SIZE && i + ii < n; ii++) { + for (int jj = 0; jj < BLOCK_SIZE && j + jj < n; jj++) { + vst1q_f32(&C[i + ii][j + jj], vecC[ii][jj]); + } + } + } + } +} +void sparse_to_dense(SparseMatrix* sparse, float denseMatrix[ROWS][COLS], int rows, int cols) { + memset(denseMatrix, 0, sizeof(float) * rows * cols); + for (int i = 0; i < sparse->num_elements; i++) { + int row = sparse->elements[i].row; + int col = sparse->elements[i].col; + float value = sparse->elements[i].value; + denseMatrix[row][col] = value; + } +}