#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
#include <time.h>

/*
 * Sparse matrix stored as parallel arrays of (row, column, value) triplets:
 * entry i is values[i] located at (rowIndex[i], colIndex[i]).
 * The three arrays are heap-allocated by initSparseMatrix() with
 * nonZeroCount elements each and must be released by the owner with free().
 */
typedef struct {
    int* values;       /* stored entry values, one per triplet */
    int* rowIndex;     /* row position of values[i] */
    int* colIndex;     /* column position of values[i] */
    int nonZeroCount;  /* number of stored triplets (length of each array) */
    int rows;          /* number of rows of the matrix */
    int cols;          /* number of columns of the matrix */
} SparseMatrix;

/*
 * Initializes `mat` with the given dimensions and allocates storage for
 * `nonZeroCount` (row, column, value) triplets.
 *
 * Allocation is all-or-nothing: if `nonZeroCount` is not positive, or any of
 * the three arrays cannot be allocated, the matrix is left in a consistent
 * empty state (all array pointers NULL, nonZeroCount == 0) so that code
 * iterating over the entries performs no accesses. Callers that intend to
 * fill the entries should check that `mat->values` is non-NULL afterwards.
 *
 * On success the arrays are zero-filled and owned by the caller, who must
 * release each of them with free().
 */
void initSparseMatrix(SparseMatrix* mat, int rows, int cols, int nonZeroCount) {
    mat->rows = rows;
    mat->cols = cols;

    /* Start from the empty state; only upgraded once every array exists. */
    mat->nonZeroCount = 0;
    mat->values = NULL;
    mat->rowIndex = NULL;
    mat->colIndex = NULL;

    if (nonZeroCount <= 0) {
        /* A negative count would otherwise wrap to a huge size_t. */
        return;
    }

    const size_t count = (size_t)nonZeroCount;

    /* calloc checks count * size for overflow and zero-fills the arrays. */
    int* values = calloc(count, sizeof *values);
    int* rowIndex = calloc(count, sizeof *rowIndex);
    int* colIndex = calloc(count, sizeof *colIndex);

    if (values == NULL || rowIndex == NULL || colIndex == NULL) {
        /* Partial failure: release whatever succeeded (free(NULL) is a no-op). */
        free(values);
        free(rowIndex);
        free(colIndex);
        return;
    }

    mat->values = values;
    mat->rowIndex = rowIndex;
    mat->colIndex = colIndex;
    mat->nonZeroCount = nonZeroCount;
}

/*
 * Writes the three triplet arrays of `mat` to stdout, one labelled line per
 * array: the stored values, then their row indices, then their column
 * indices. Each number is followed by a single space.
 */
void printSparseMatrix(SparseMatrix* mat) {
    /* Each heading after the first also terminates the previous line. */
    const char* const headings[3] = {
        "Values: ",
        "\nRow indices: ",
        "\nColumn indices: "
    };
    const int* const columns[3] = {
        mat->values,
        mat->rowIndex,
        mat->colIndex
    };
    const int total = mat->nonZeroCount;

    for (int section = 0; section < 3; ++section) {
        fputs(headings[section], stdout);
        for (int k = 0; k < total; ++k) {
            printf("%d ", columns[section][k]);
        }
    }
    putchar('\n');
}

/*
 * Computes the dense product C = A * B of two sparse matrices stored as
 * (row, column, value) triplets.
 *
 * Parameters:
 *   A, B        - input matrices; only their triplet arrays and
 *                 nonZeroCount are read.
 *   C           - output buffer of resultRows * resultCols floats in
 *                 row-major order; it is fully overwritten.
 *   resultRows  - number of rows of C.
 *   resultCols  - number of columns of C.
 *
 * For every pair of entries A(r, k) and B(k, c) the product of the two
 * values is accumulated into exactly one cell, C[r * resultCols + c].
 * Accumulation is scalar on purpose: a 4-lane vector load/store at that
 * address would also modify the three following cells and could run past
 * the end of C. Pairs whose target cell lies outside the
 * resultRows x resultCols output are skipped rather than written out of
 * bounds.
 *
 * Does nothing if C is NULL or either result dimension is not positive.
 */
void sparse_matmul_optimized(SparseMatrix* A, SparseMatrix* B, float* C, int resultRows, int resultCols) {
    if (C == NULL || resultRows <= 0 || resultCols <= 0) {
        return;
    }

    /* Compute the element count in size_t to avoid int overflow. */
    const size_t total = (size_t)resultRows * (size_t)resultCols;
    for (size_t i = 0; i < total; i++) {
        C[i] = 0.0f;
    }

    for (int i = 0; i < A->nonZeroCount; i++) {
        const int rowA = A->rowIndex[i];
        const int colA = A->colIndex[i];

        if (rowA < 0 || rowA >= resultRows) {
            continue;  /* entry cannot contribute to any cell of C */
        }

        const float valueA = (float)A->values[i];
        float* outRow = C + (size_t)rowA * (size_t)resultCols;

        for (int j = 0; j < B->nonZeroCount; j++) {
            if (B->rowIndex[j] != colA) {
                continue;  /* inner dimensions do not match */
            }

            const int colB = B->colIndex[j];
            if (colB < 0 || colB >= resultCols) {
                continue;  /* target column outside of C */
            }

            outRow[colB] += valueA * (float)B->values[j];
        }
    }
}

/*
 * Demo driver: builds two 3x3 sparse matrices with four entries each,
 * prints them, multiplies them into a dense row-major result, prints the
 * result and the elapsed CPU time of the multiplication.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE if any allocation fails.
 * All heap memory is released on every path through the single cleanup
 * label.
 */
int main(void) {
    int status = EXIT_FAILURE;
    /* Zero-initialized so the cleanup path may free them at any point. */
    SparseMatrix A = {0};
    SparseMatrix B = {0};
    float* C = NULL;
    const int nonZeroCountA = 4, nonZeroCountB = 4;

    initSparseMatrix(&A, 3, 3, nonZeroCountA);
    if (A.values == NULL || A.rowIndex == NULL || A.colIndex == NULL) {
        fprintf(stderr, "Failed to allocate matrix A\n");
        goto cleanup;
    }
    A.values[0] = 5; A.rowIndex[0] = 0; A.colIndex[0] = 0;
    A.values[1] = 8; A.rowIndex[1] = 0; A.colIndex[1] = 2;
    A.values[2] = 3; A.rowIndex[2] = 1; A.colIndex[2] = 1;
    A.values[3] = 6; A.rowIndex[3] = 2; A.colIndex[3] = 0;

    initSparseMatrix(&B, 3, 3, nonZeroCountB);
    if (B.values == NULL || B.rowIndex == NULL || B.colIndex == NULL) {
        fprintf(stderr, "Failed to allocate matrix B\n");
        goto cleanup;
    }
    B.values[0] = 2; B.rowIndex[0] = 0; B.colIndex[0] = 1;
    B.values[1] = 7; B.rowIndex[1] = 1; B.colIndex[1] = 0;
    B.values[2] = 4; B.rowIndex[2] = 2; B.colIndex[2] = 2;
    B.values[3] = 9; B.rowIndex[3] = 2; B.colIndex[3] = 1;

    printf("Matrix A:\n");
    printSparseMatrix(&A);
    printf("\nMatrix B:\n");
    printSparseMatrix(&B);

    /* The product of an (m x k) and a (k x n) matrix is (m x n). */
    const int resultRows = A.rows;
    const int resultCols = B.cols;
    C = malloc((size_t)resultRows * (size_t)resultCols * sizeof *C);
    if (C == NULL) {
        fprintf(stderr, "Failed to allocate result matrix\n");
        goto cleanup;
    }

    const clock_t start = clock();
    sparse_matmul_optimized(&A, &B, C, resultRows, resultCols);
    const clock_t end = clock();

    printf("\nMatrix C (Result):\n");
    for (int i = 0; i < resultRows; i++) {
        for (int j = 0; j < resultCols; j++) {
            printf("%.2f ", C[i * resultCols + j]);
        }
        printf("\n");
    }

    const double time_taken = (double)(end - start) / CLOCKS_PER_SEC;
    printf("\nSparse Matrix Multiplication Time: %f seconds\n", time_taken);

    status = EXIT_SUCCESS;

cleanup:
    /* free(NULL) is a no-op, so no guards are needed. */
    free(A.values);
    free(A.rowIndex);
    free(A.colIndex);
    free(B.values);
    free(B.rowIndex);
    free(B.colIndex);
    free(C);

    return status;
}