/*
 * Sparse matrix multiplication demo.
 *
 * Matrices are stored in coordinate (COO) form: three parallel arrays hold,
 * for each stored entry, its value, its row index and its column index.
 * The product is written into a dense row-major float buffer.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * NOTE(review): the multiplication kernel below is scalar and does not need
 * NEON. The header is kept, guarded, so ARM builds that rely on it being
 * pulled in here keep working; drop it if nothing else uses it.
 */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

/**
 * Sparse matrix in coordinate form.
 *
 * values[k], rowIndex[k] and colIndex[k] together describe stored entry k,
 * for 0 <= k < nonZeroCount. The three arrays are heap-allocated by
 * initSparseMatrix() and must be released by the owner with free().
 */
typedef struct {
    int* values;      /* value of each stored entry */
    int* rowIndex;    /* row of each stored entry */
    int* colIndex;    /* column of each stored entry */
    int nonZeroCount; /* number of stored entries (length of the arrays) */
    int rows;         /* logical row count */
    int cols;         /* logical column count */
} SparseMatrix;

/**
 * Initialise a sparse matrix and allocate storage for its entries.
 *
 * The entry arrays are zero-initialised. If nonZeroCount is not positive, or
 * if any allocation fails, the matrix is left in a valid empty state: all
 * three array pointers are NULL and mat->nonZeroCount is 0. Callers detect
 * allocation failure by comparing mat->nonZeroCount with the requested count.
 *
 * @param mat           matrix to initialise (must not be NULL)
 * @param rows          logical number of rows
 * @param cols          logical number of columns
 * @param nonZeroCount  number of entries to reserve space for
 */
void initSparseMatrix(SparseMatrix* mat, int rows, int cols, int nonZeroCount)
{
    mat->rows = rows;
    mat->cols = cols;
    mat->nonZeroCount = 0;
    mat->values = NULL;
    mat->rowIndex = NULL;
    mat->colIndex = NULL;

    if (nonZeroCount <= 0) {
        return;
    }

    size_t count = (size_t)nonZeroCount;
    int* values = calloc(count, sizeof *values);
    int* rowIndex = calloc(count, sizeof *rowIndex);
    int* colIndex = calloc(count, sizeof *colIndex);

    if (values == NULL || rowIndex == NULL || colIndex == NULL) {
        /* Partial failure: release whatever succeeded and stay empty. */
        free(values);
        free(rowIndex);
        free(colIndex);
        return;
    }

    mat->values = values;
    mat->rowIndex = rowIndex;
    mat->colIndex = colIndex;
    mat->nonZeroCount = nonZeroCount;
}

/* Print `label` followed by `count` integers, each followed by one space. */
static void printIntArray(const char* label, const int* data, int count)
{
    printf("%s", label);
    for (int i = 0; i < count; i++) {
        printf("%d ", data[i]);
    }
}

/**
 * Print the three coordinate arrays of a sparse matrix to stdout.
 *
 * @param mat  matrix to print (must not be NULL)
 */
void printSparseMatrix(const SparseMatrix* mat)
{
    printIntArray("Values: ", mat->values, mat->nonZeroCount);
    printIntArray("\nRow indices: ", mat->rowIndex, mat->nonZeroCount);
    printIntArray("\nColumn indices: ", mat->colIndex, mat->nonZeroCount);
    printf("\n");
}

/**
 * Compute the dense product C = A * B of two coordinate-form matrices.
 *
 * C is first cleared, then every pair of entries (a from A, b from B) whose
 * inner indices match (a.col == b.row) contributes a.value * b.value to
 * C[a.row][b.col]. Each contribution updates exactly one cell of C.
 *
 * Entries whose target row or column falls outside the result dimensions are
 * skipped rather than written out of bounds.
 *
 * @param A           left operand
 * @param B           right operand
 * @param C           output buffer of resultRows * resultCols floats,
 *                    row-major; overwritten
 * @param resultRows  number of rows of C
 * @param resultCols  number of columns of C
 */
void sparse_matmul_optimized(const SparseMatrix* A, const SparseMatrix* B,
                             float* C, int resultRows, int resultCols)
{
    if (C == NULL || resultRows <= 0 || resultCols <= 0) {
        return;
    }

    /* size_t arithmetic so large dimensions cannot overflow int. */
    size_t cellCount = (size_t)resultRows * (size_t)resultCols;
    for (size_t k = 0; k < cellCount; k++) {
        C[k] = 0.0f;
    }

    if (A == NULL || B == NULL) {
        return;
    }

    for (int i = 0; i < A->nonZeroCount; i++) {
        int rowA = A->rowIndex[i];
        if (rowA < 0 || rowA >= resultRows) {
            continue;
        }

        int colA = A->colIndex[i];
        float valueA = (float)A->values[i];
        float* outRow = C + (size_t)rowA * (size_t)resultCols;

        for (int j = 0; j < B->nonZeroCount; j++) {
            if (B->rowIndex[j] != colA) {
                continue;
            }

            int colB = B->colIndex[j];
            if (colB < 0 || colB >= resultCols) {
                continue;
            }

            /*
             * Scalar accumulate: a single product belongs to a single cell.
             * A 4-lane vector load/store here would also modify the three
             * following cells and run past the end of C.
             */
            outRow[colB] += valueA * (float)B->values[j];
        }
    }
}

/* Store one (value, row, col) entry at position k of a sparse matrix. */
static void setEntry(SparseMatrix* mat, int k, int value, int row, int col)
{
    mat->values[k] = value;
    mat->rowIndex[k] = row;
    mat->colIndex[k] = col;
}

/*
 * Multiply A by B into a freshly allocated dense buffer, print the result
 * and the elapsed CPU time. Returns EXIT_SUCCESS, or EXIT_FAILURE if the
 * result buffer cannot be allocated.
 */
static int runDemo(const SparseMatrix* A, const SparseMatrix* B)
{
    int resultRows = A->rows;
    int resultCols = B->cols;

    float* C = malloc((size_t)resultRows * (size_t)resultCols * sizeof *C);
    if (C == NULL) {
        fprintf(stderr, "Failed to allocate result matrix\n");
        return EXIT_FAILURE;
    }

    clock_t start = clock();
    sparse_matmul_optimized(A, B, C, resultRows, resultCols);
    clock_t end = clock();

    printf("\nMatrix C (Result):\n");
    for (int i = 0; i < resultRows; i++) {
        for (int j = 0; j < resultCols; j++) {
            printf("%.2f ", C[i * resultCols + j]);
        }
        printf("\n");
    }

    double time_taken = (double)(end - start) / CLOCKS_PER_SEC;
    printf("\nSparse Matrix Multiplication Time: %f seconds\n", time_taken);

    free(C);
    return EXIT_SUCCESS;
}

/**
 * Build two 3x3 sample matrices, print them, multiply them and print the
 * dense result together with the time taken.
 */
int main(void)
{
    SparseMatrix A, B;
    int nonZeroCountA = 4, nonZeroCountB = 4;
    int status = EXIT_FAILURE;

    initSparseMatrix(&A, 3, 3, nonZeroCountA);
    initSparseMatrix(&B, 3, 3, nonZeroCountB);

    if (A.nonZeroCount != nonZeroCountA || B.nonZeroCount != nonZeroCountB) {
        fprintf(stderr, "Failed to allocate sparse matrices\n");
    } else {
        setEntry(&A, 0, 5, 0, 0);
        setEntry(&A, 1, 8, 0, 2);
        setEntry(&A, 2, 3, 1, 1);
        setEntry(&A, 3, 6, 2, 0);

        setEntry(&B, 0, 2, 0, 1);
        setEntry(&B, 1, 7, 1, 0);
        setEntry(&B, 2, 4, 2, 2);
        setEntry(&B, 3, 9, 2, 1);

        printf("Matrix A:\n");
        printSparseMatrix(&A);
        printf("\nMatrix B:\n");
        printSparseMatrix(&B);

        status = runDemo(&A, &B);
    }

    free(A.values);
    free(A.rowIndex);
    free(A.colIndex);
    free(B.values);
    free(B.rowIndex);
    free(B.colIndex);
    return status;
}