/*
 * Sparse matrix multiplication demo with a NEON-accelerated dense kernel.
 *
 * Two small matrices are described in coordinate (triplet) form, expanded
 * into dense ROWS x COLS arrays, multiplied, printed and timed.
 *
 * Define SPARSE_MATMUL_NO_MAIN to compile only the matrix routines (for
 * example when embedding this file in a test harness).
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Use NEON when the target provides it; otherwise fall back to plain C. */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define SPARSE_MATMUL_HAVE_NEON 1
#else
#define SPARSE_MATMUL_HAVE_NEON 0
#endif

/* Fixed dimensions of every dense matrix in this file. */
#define ROWS 10
#define COLS 10

/* A[ROWS][COLS] * B[ROWS][COLS] is only defined when the matrices are square. */
#if ROWS != COLS
#error "matmul_optimized multiplies two ROWS x COLS matrices and needs ROWS == COLS"
#endif

/*
 * Sparse matrix in coordinate form: entry k has value values[k] and lives at
 * (rowIndex[k], colIndex[k]). All three arrays hold nonZeroCount elements.
 */
typedef struct {
    float *values;
    int *rowIndex;
    int *colIndex;
    int nonZeroCount;
} SparseMatrix;

/*
 * Expand a coordinate-form sparse matrix into a dense ROWS x COLS array.
 *
 * Every element of denseMatrix is first set to zero, then each stored entry
 * is written at its (row, col) position. If the same position appears more
 * than once the last entry wins. Entries whose row or column lies outside
 * the dense array are skipped instead of being written out of bounds.
 */
void sparseToDense(SparseMatrix *sparse, float denseMatrix[ROWS][COLS]) {
    for (int i = 0; i < ROWS; i++) {
        for (int j = 0; j < COLS; j++) {
            denseMatrix[i][j] = 0.0f;
        }
    }

    for (int k = 0; k < sparse->nonZeroCount; k++) {
        int row = sparse->rowIndex[k];
        int col = sparse->colIndex[k];

        if (row < 0 || row >= ROWS || col < 0 || col >= COLS) {
            continue; /* index does not fit the dense array */
        }
        denseMatrix[row][col] = sparse->values[k];
    }
}

/*
 * Dense matrix product C = A * B.
 *
 * C is fully overwritten; it must not be the same array as A or B because
 * results are stored while the inputs are still being read.
 *
 * With NEON, four adjacent output columns C[i][j..j+3] are produced at once:
 * for each k the scalar A[i][k] is broadcast to all lanes and multiplied by
 * the contiguous slice B[k][j..j+3]. Walking B along its rows keeps every
 * vector load contiguous and inside the row. Columns left over when COLS is
 * not a multiple of four (and all columns when NEON is unavailable) are
 * computed by the scalar loop.
 */
void matmul_optimized(float A[ROWS][COLS], float B[ROWS][COLS], float C[ROWS][COLS]) {
    for (int i = 0; i < ROWS; i++) {
        int j = 0;

#if SPARSE_MATMUL_HAVE_NEON
        /* "j + 4 <= COLS" guarantees the 4-wide load/store stays in the row. */
        for (; j + 4 <= COLS; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);

            for (int k = 0; k < COLS; k++) {
                float32x4_t a = vdupq_n_f32(A[i][k]);
                float32x4_t b = vld1q_f32(&B[k][j]);
                acc = vmlaq_f32(acc, a, b);
            }
            vst1q_f32(&C[i][j], acc);
        }
#endif

        /* Scalar tail (or the whole row when NEON is not available). */
        for (; j < COLS; j++) {
            float sum = 0.0f;

            for (int k = 0; k < COLS; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}

/* Print a dense matrix to stdout, one row per line, using "%f ". */
void printMatrix(float matrix[ROWS][COLS]) {
    for (int i = 0; i < ROWS; i++) {
        for (int j = 0; j < COLS; j++) {
            printf("%f ", matrix[i][j]);
        }
        printf("\n");
    }
}

#ifndef SPARSE_MATMUL_NO_MAIN

/* Release the arrays owned by a sparse matrix and leave it empty. */
static void sparseFree(SparseMatrix *sparse) {
    free(sparse->values);
    free(sparse->rowIndex);
    free(sparse->colIndex);
    sparse->values = NULL;
    sparse->rowIndex = NULL;
    sparse->colIndex = NULL;
    sparse->nonZeroCount = 0;
}

/*
 * Allocate a sparse matrix with `count` entries (count must be positive)
 * and copy the given triplets into it.
 *
 * Returns 0 on success. On allocation failure nothing stays allocated and
 * -1 is returned.
 */
static int sparseInit(SparseMatrix *sparse, int count,
                      const float *values, const int *rows, const int *cols) {
    sparse->nonZeroCount = 0;
    sparse->values = NULL;
    sparse->rowIndex = NULL;
    sparse->colIndex = NULL;

    if (count <= 0) {
        return -1;
    }

    sparse->values = malloc((size_t)count * sizeof *sparse->values);
    sparse->rowIndex = malloc((size_t)count * sizeof *sparse->rowIndex);
    sparse->colIndex = malloc((size_t)count * sizeof *sparse->colIndex);

    if (sparse->values == NULL || sparse->rowIndex == NULL || sparse->colIndex == NULL) {
        sparseFree(sparse);
        return -1;
    }

    for (int k = 0; k < count; k++) {
        sparse->values[k] = values[k];
        sparse->rowIndex[k] = rows[k];
        sparse->colIndex[k] = cols[k];
    }
    sparse->nonZeroCount = count;
    return 0;
}

/*
 * Build the two demo matrices, print them, multiply them and report the
 * CPU time spent in the multiplication. Returns 1 if memory runs out.
 */
int main(void) {
    /* Non-zero entries of A as (row, col) -> value triplets. */
    static const float valuesA[] = {1, 2, 3, 4, 5};
    static const int rowsA[] = {0, 1, 2, 1, 2};
    static const int colsA[] = {0, 1, 2, 0, 1};

    /* Non-zero entries of B. */
    static const float valuesB[] = {2, 3, 4, 5};
    static const int rowsB[] = {0, 1, 1, 2};
    static const int colsB[] = {0, 1, 0, 1};

    SparseMatrix sparseA;
    SparseMatrix sparseB;

    if (sparseInit(&sparseA, (int)(sizeof valuesA / sizeof valuesA[0]),
                   valuesA, rowsA, colsA) != 0) {
        fprintf(stderr, "Memory allocation failed for sparse matrix A!\n");
        return 1;
    }

    if (sparseInit(&sparseB, (int)(sizeof valuesB / sizeof valuesB[0]),
                   valuesB, rowsB, colsB) != 0) {
        sparseFree(&sparseA); /* do not leak A when B cannot be created */
        fprintf(stderr, "Memory allocation failed for sparse matrix B!\n");
        return 1;
    }

    float denseA[ROWS][COLS];
    float denseB[ROWS][COLS];
    float denseC[ROWS][COLS];

    sparseToDense(&sparseA, denseA);
    sparseToDense(&sparseB, denseB);

    printf("matrixA:\n");
    printMatrix(denseA);

    printf("matrixB:\n");
    printMatrix(denseB);

    /* Time only the multiplication itself. */
    clock_t startMul = clock();
    matmul_optimized(denseA, denseB, denseC);
    clock_t endMul = clock();
    double time_taken_Mul = ((double)(endMul - startMul)) / CLOCKS_PER_SEC;
    printf("NEON time: %lfs\n", time_taken_Mul);

    /*
     * Show the product: it lets the result be checked and keeps the timed
     * call from being discarded as dead code by the optimizer.
     */
    printf("matrixC:\n");
    printMatrix(denseC);

    sparseFree(&sparseA);
    sparseFree(&sparseB);

    return 0;
}

#endif /* SPARSE_MATMUL_NO_MAIN */