#define _CRT_SECURE_NO_WARNINGS
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <arm_neon.h>

#define ROWS 4  
#define COLS 4  

/*
 * Sparse matrix in coordinate form: entry i is the value values[i] located at
 * (rowIndex[i], colIndex[i]). The three arrays run in parallel.
 */
typedef struct {
    float *values;     /* stored entry values */
    int *rowIndex;     /* row of each stored entry */
    int *colIndex;     /* column of each stored entry */
    int nonZeroCount;  /* number of entries held in the three arrays */
} SparseMatrix;

/*
 * Allocates a SparseMatrix with room for `nonZeroCount` entries.
 *
 * The value and index arrays are zero-initialised and nonZeroCount is set to
 * the requested count; the caller fills the entries in afterwards and releases
 * the matrix with freeSparseMatrix().
 *
 * Returns NULL if `nonZeroCount` is negative or if any allocation fails. On
 * failure everything that was allocated so far is released again.
 */
SparseMatrix* createSparseMatrix(int nonZeroCount) {
    if (nonZeroCount < 0) {
        return NULL;
    }

    SparseMatrix* matrix = (SparseMatrix*)malloc(sizeof *matrix);
    if (matrix == NULL) {
        return NULL;
    }

    /* Always request at least one element: a zero-sized allocation may
     * legally return NULL, which would be indistinguishable from failure. */
    const size_t capacity = nonZeroCount > 0 ? (size_t)nonZeroCount : 1;

    /* calloc rejects element counts whose byte size would overflow size_t. */
    matrix->values = (float*)calloc(capacity, sizeof *matrix->values);
    matrix->rowIndex = (int*)calloc(capacity, sizeof *matrix->rowIndex);
    matrix->colIndex = (int*)calloc(capacity, sizeof *matrix->colIndex);

    if (matrix->values == NULL || matrix->rowIndex == NULL || matrix->colIndex == NULL) {
        /* free(NULL) is a no-op, so the arrays that did fail are safe to pass. */
        free(matrix->values);
        free(matrix->rowIndex);
        free(matrix->colIndex);
        free(matrix);
        return NULL;
    }

    matrix->nonZeroCount = nonZeroCount;
    return matrix;
}
 
/*
 * Releases a SparseMatrix together with its value and index arrays.
 *
 * Passing NULL is a no-op, mirroring free(), so callers can release the result
 * of a failed allocation without a separate check.
 */
void freeSparseMatrix(SparseMatrix* matrix) {
    if (matrix == NULL) {
        return;
    }

    free(matrix->values);
    free(matrix->rowIndex);
    free(matrix->colIndex);
    free(matrix);
}
 
/*
 * Expands a coordinate-form sparse matrix into a dense ROWS x COLS array.
 *
 * `dense` is cleared to 0.0f first, then every stored entry is written to its
 * (row, column) cell. If the same coordinate is stored more than once, the
 * last entry wins. Indices are not range-checked here.
 */
void sparseToDense(SparseMatrix* sparse, float dense[ROWS][COLS]) {
    /* Clear the whole destination, one row at a time. */
    for (int r = 0; r < ROWS; ++r) {
        float* rowCells = dense[r];
        for (int c = 0; c < COLS; ++c) {
            rowCells[c] = 0.0f;
        }
    }

    /* Scatter the stored entries into their cells. */
    for (int n = 0; n < sparse->nonZeroCount; ++n) {
        float* target = &dense[sparse->rowIndex[n]][sparse->colIndex[n]];
        *target = sparse->values[n];
    }
}
 
/*
 * Prints a dense ROWS x COLS matrix to stdout, one matrix row per line.
 * Each element is formatted as "%5.1f" followed by a single space.
 */
void printDenseMatrix(float dense[ROWS][COLS]) {
    for (int r = 0; r < ROWS; ++r) {
        const float* rowCells = dense[r];
        for (int c = 0; c < COLS; ++c) {
            printf("%5.1f ", rowCells[c]);
        }
        printf("\n");
    }
}

/*
 * Multiplies two coordinate-form sparse matrices and returns a newly allocated
 * coordinate-form product.
 *
 * Every stored entry A(r, k) is paired with every stored entry B(k, c) and the
 * product is accumulated into the result entry at (r, c). Result entries
 * appear in the order in which their coordinates were first produced; an entry
 * whose accumulated value ends up as 0 is still stored.
 *
 * Returns NULL if the worst-case number of result entries does not fit in an
 * int or if memory cannot be allocated. The caller owns the result and
 * releases it with freeSparseMatrix().
 */
SparseMatrix* sparse_matmul(SparseMatrix* A, SparseMatrix* B) {
    /* A non-positive count means there is nothing to iterate over. */
    const int aCount = A->nonZeroCount > 0 ? A->nonZeroCount : 0;
    const int bCount = B->nonZeroCount > 0 ? B->nonZeroCount : 0;

    /* Worst case: every pairing lands on its own coordinate. Check before
     * multiplying, because signed overflow is undefined behaviour. */
    if (bCount != 0 && aCount > INT_MAX / bCount) {
        return NULL;
    }
    const int maxNonZeroCount = aCount * bCount;

    /* Ask for at least one slot so that a zero-sized request can never be
     * confused with an allocation failure. */
    SparseMatrix* C = createSparseMatrix(maxNonZeroCount > 0 ? maxNonZeroCount : 1);
    if (C == NULL) {
        return NULL;
    }
    if (C->values == NULL || C->rowIndex == NULL || C->colIndex == NULL) {
        freeSparseMatrix(C);
        return NULL;
    }

    int count = 0;

    for (int i = 0; i < aCount; i++) {
        float aValue = A->values[i];
        int aRow = A->rowIndex[i];
        int aCol = A->colIndex[i];

        for (int j = 0; j < bCount; j++) {
            /* Only entries on the shared inner index contribute. */
            if (aCol != B->rowIndex[j]) {
                continue;
            }

            float bValue = B->values[j];
            int bCol = B->colIndex[j];

            /* Linear search for an existing result entry at (aRow, bCol). */
            int found = 0;
            for (int k = 0; k < count; k++) {
                if (C->rowIndex[k] == aRow && C->colIndex[k] == bCol) {
                    C->values[k] += aValue * bValue;
                    found = 1;
                    break;
                }
            }
            if (!found) {
                C->values[count] = aValue * bValue;
                C->rowIndex[count] = aRow;
                C->colIndex[count] = bCol;
                count++;
            }
        }
    }

    C->nonZeroCount = count;

    /* Trim the arrays to the entries actually used. An empty result keeps its
     * buffers, since realloc with size 0 has no portable meaning. If a shrink
     * fails, the original (larger) buffer is still valid and is kept. */
    if (count > 0 && count < maxNonZeroCount) {
        const size_t used = (size_t)count;

        float* values = (float*)realloc(C->values, used * sizeof(float));
        if (values != NULL) {
            C->values = values;
        }

        int* rows = (int*)realloc(C->rowIndex, used * sizeof(int));
        if (rows != NULL) {
            C->rowIndex = rows;
        }

        int* cols = (int*)realloc(C->colIndex, used * sizeof(int));
        if (cols != NULL) {
            C->colIndex = cols;
        }
    }

    return C;
}

/*
 * Computes the dense product C = A * B from two coordinate-form sparse
 * matrices.
 *
 * C is cleared first; then, for every stored pair A(r, k) and B(k, c), the
 * product of the two values is added to C[r][c].
 *
 * Each contribution targets exactly one cell, so it is accumulated with a
 * scalar add. A 4-lane NEON load / multiply-accumulate / store anchored at
 * &C[r][c] must not be used for this: it would add the product to the three
 * following floats as well, spilling into the next row and, near the end of
 * the array, past the end of C.
 *
 * Row and column indices are not validated here; they are assumed to lie
 * within [0, ROWS) and [0, COLS) -- the caller must guarantee this.
 */
void neonSparseMatMul(SparseMatrix* A, SparseMatrix* B, float C[ROWS][COLS]) {
    for (int i = 0; i < ROWS; i++) {
        for (int j = 0; j < COLS; j++) {
            C[i][j] = 0.0f;
        }
    }

    for (int i = 0; i < A->nonZeroCount; i++) {
        float aValue = A->values[i];
        int aRow = A->rowIndex[i];
        int aCol = A->colIndex[i];

        for (int j = 0; j < B->nonZeroCount; j++) {
            /* Only entries on the shared inner index contribute. */
            if (aCol != B->rowIndex[j]) {
                continue;
            }

            C[aRow][B->colIndex[j]] += aValue * B->values[j];
        }
    }
}

/* Returns nonzero when `m` and all three of its arrays are non-NULL. */
static int isSparseMatrixAllocated(const SparseMatrix* m) {
    return m != NULL && m->values != NULL && m->rowIndex != NULL && m->colIndex != NULL;
}

/*
 * Demo driver: builds two 4-entry sparse matrices, multiplies them with
 * neonSparseMatMul() while timing the call with clock(), then prints both
 * inputs in dense form, the product, and the elapsed time.
 *
 * Returns EXIT_FAILURE if the input matrices cannot be allocated.
 */
int main(void) {
    SparseMatrix* A = createSparseMatrix(4);
    SparseMatrix* B = createSparseMatrix(4);
    if (!isSparseMatrixAllocated(A) || !isSparseMatrixAllocated(B)) {
        fprintf(stderr, "failed to allocate the input matrices\n");
        /* Guarded individually so a NULL handle is never passed on. */
        if (A != NULL) {
            freeSparseMatrix(A);
        }
        if (B != NULL) {
            freeSparseMatrix(B);
        }
        return EXIT_FAILURE;
    }

    /* A: (0,0)=1, (0,2)=2, (1,1)=3, (2,0)=4 */
    A->values[0] = 1.0f; A->rowIndex[0] = 0; A->colIndex[0] = 0;
    A->values[1] = 2.0f; A->rowIndex[1] = 0; A->colIndex[1] = 2;
    A->values[2] = 3.0f; A->rowIndex[2] = 1; A->colIndex[2] = 1;
    A->values[3] = 4.0f; A->rowIndex[3] = 2; A->colIndex[3] = 0;

    /* B: (0,1)=5, (1,2)=6, (2,0)=7, (2,1)=8 */
    B->values[0] = 5.0f; B->rowIndex[0] = 0; B->colIndex[0] = 1;
    B->values[1] = 6.0f; B->rowIndex[1] = 1; B->colIndex[1] = 2;
    B->values[2] = 7.0f; B->rowIndex[2] = 2; B->colIndex[2] = 0;
    B->values[3] = 8.0f; B->rowIndex[3] = 2; B->colIndex[3] = 1;

    float C[ROWS][COLS];

    /* Only the multiplication itself is timed. */
    clock_t start = clock();

    neonSparseMatMul(A, B, C);

    clock_t end = clock();
    double time_taken = (double)(end - start) / CLOCKS_PER_SEC;

    float denseA[ROWS][COLS], denseB[ROWS][COLS];
    sparseToDense(A, denseA);
    sparseToDense(B, denseB);

    printf("普通矩阵 A:\n");
    printDenseMatrix(denseA);
    printf("普通矩阵 B:\n");
    printDenseMatrix(denseB);

    printf("稀疏矩阵乘法结果:\n");
    printDenseMatrix(C);
    printf("稀疏矩阵乘法运行时间: %f 秒\n", time_taken);

    freeSparseMatrix(A);
    freeSparseMatrix(B);

    return 0;
}