/*
 * Demo: expand two small coordinate-list sparse matrices into dense
 * ROWS x COLS matrices, multiply them, and report the elapsed CPU time.
 *
 * The multiplication kernel uses Arm NEON intrinsics when the compiler
 * targets NEON and falls back to plain scalar code otherwise.
 *
 * Define SPARSE_MATMUL_NO_MAIN to compile only the two kernels without the
 * demo driver (for example when embedding them in a test harness).
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define MATMUL_USE_NEON 1
#else
#define MATMUL_USE_NEON 0
#endif

#define ROWS 1024
#define COLS 1024

#if ROWS != COLS
#error "the demo multiplies square matrices: ROWS must equal COLS"
#endif


/**
 * Expand a coordinate-list sparse matrix into a dense ROWS x COLS matrix.
 *
 * The whole destination is zeroed first; then, for every k in
 * [0, nonZeroCount), values[k] is stored at
 * denseMatrix[rowIndex[k]][colIndex[k]]. If a coordinate occurs more than
 * once, the last occurrence wins.
 *
 * Entries whose row or column falls outside the matrix are skipped so that a
 * malformed index can never write out of bounds. If any of the three input
 * arrays is NULL the result is simply the all-zero matrix.
 *
 * @param values        values of the stored entries
 * @param rowIndex      row of each stored entry
 * @param colIndex      column of each stored entry
 * @param nonZeroCount  number of stored entries (<= 0 means none)
 * @param denseMatrix   destination, fully overwritten
 */
void sparseToDense(float* values, int* rowIndex, int* colIndex, int nonZeroCount, float denseMatrix[ROWS][COLS]) {
    for (int i = 0; i < ROWS; i++) {
        for (int j = 0; j < COLS; j++) {
            denseMatrix[i][j] = 0.0f;
        }
    }

    if (values == NULL || rowIndex == NULL || colIndex == NULL) {
        return;
    }

    for (int k = 0; k < nonZeroCount; k++) {
        int row = rowIndex[k];
        int col = colIndex[k];

        if (row < 0 || row >= ROWS || col < 0 || col >= COLS) {
            continue; /* malformed coordinate: ignore instead of corrupting memory */
        }
        denseMatrix[row][col] = values[k];
    }
}


/**
 * Compute C = A * B for n x n matrices given as arrays of row pointers.
 *
 * Each of A[i], B[i] and C[i] (0 <= i < n) must point to at least n floats.
 * C must not overlap A or B, because rows of C are written while A and B
 * are still being read. Any n is accepted; n <= 0 leaves C untouched.
 *
 * With NEON, four adjacent output columns are produced at once: elements
 * B[k][j..j+3] are contiguous in memory, so they can be loaded as one vector
 * and scaled by the scalar A[i][k]. (Loading B[k][j..j+3] and multiplying it
 * lane-wise with A[i][k..k+3] would NOT yield a dot product with column j.)
 * Columns left over when n is not a multiple of 4, and all columns on
 * non-NEON targets, are handled by the scalar loop.
 *
 * @param A  left operand, n row pointers
 * @param B  right operand, n row pointers
 * @param C  result, n row pointers; every element is overwritten
 * @param n  matrix dimension
 */
void matmul_optimized(float** A, float** B, float** C, int n) {
    for (int i = 0; i < n; i++) {
        int j = 0;

#if MATMUL_USE_NEON
        for (; j + 4 <= n; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);

            for (int k = 0; k < n; k++) {
                /* acc[l] += B[k][j + l] * A[i][k]  for l = 0..3 */
                acc = vmlaq_n_f32(acc, vld1q_f32(&B[k][j]), A[i][k]);
            }
            vst1q_f32(&C[i][j], acc);
        }
#endif

        /* Scalar path: tail columns on NEON, every column otherwise. */
        for (; j < n; j++) {
            float sum = 0.0f;

            for (int k = 0; k < n; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}


#ifndef SPARSE_MATMUL_NO_MAIN

/* Print a title line followed by every row of a dense ROWS x COLS matrix. */
static void printMatrix(const char* title, float matrix[ROWS][COLS]) {
    printf("%s", title);
    for (int i = 0; i < ROWS; i++) {
        for (int j = 0; j < COLS; j++) {
            printf("%f ", matrix[i][j]);
        }
        printf("\n");
    }
}

int main(void) {
    /* Sparse matrix A: diag(1, 2, 3) in the top-left corner. */
    float A_values[] = {1, 2, 3};
    int A_rowIndex[] = {0, 1, 2};
    int A_colIndex[] = {0, 1, 2};
    int A_nonZeroCount = 3;

    /* Sparse matrix B: diag(4, 5, 6) in the top-left corner. */
    float B_values[] = {4, 5, 6};
    int B_rowIndex[] = {0, 1, 2};
    int B_colIndex[] = {0, 1, 2};
    int B_nonZeroCount = 3;

    /*
     * Three ROWS x COLS float matrices need 12 MiB in total, which exceeds
     * common default stack limits, so they live on the heap.
     */
    float (*denseMatrixA)[COLS] = malloc(ROWS * sizeof *denseMatrixA);
    float (*denseMatrixB)[COLS] = malloc(ROWS * sizeof *denseMatrixB);
    float (*resultMatrix)[COLS] = malloc(ROWS * sizeof *resultMatrix);

    if (denseMatrixA == NULL || denseMatrixB == NULL || resultMatrix == NULL) {
        fprintf(stderr, "failed to allocate %d x %d matrices\n", ROWS, COLS);
        free(denseMatrixA);
        free(denseMatrixB);
        free(resultMatrix);
        return EXIT_FAILURE;
    }

    sparseToDense(A_values, A_rowIndex, A_colIndex, A_nonZeroCount, denseMatrixA);
    printMatrix("常规矩阵A:\n", denseMatrixA);

    sparseToDense(B_values, B_rowIndex, B_colIndex, B_nonZeroCount, denseMatrixB);
    printMatrix("常规矩阵B:\n", denseMatrixB);

    /* matmul_optimized takes row-pointer tables rather than 2-D arrays. */
    float* matrixAPtr[ROWS];
    float* matrixBPtr[ROWS];
    float* resultMatrixPtr[ROWS];
    for (int i = 0; i < ROWS; i++) {
        matrixAPtr[i] = denseMatrixA[i];
        matrixBPtr[i] = denseMatrixB[i];
        resultMatrixPtr[i] = resultMatrix[i];
    }

    clock_t start_time = clock();
    matmul_optimized(matrixAPtr, matrixBPtr, resultMatrixPtr, ROWS);
    clock_t end_time = clock();

    double elapsed_time = ((double)(end_time - start_time)) / CLOCKS_PER_SEC;
    printf("优化的稀疏矩阵乘法(使用NEON)的运行时间:%f 秒\n", elapsed_time);

    printMatrix("结果矩阵:\n", resultMatrix);

    free(denseMatrixA);
    free(denseMatrixB);
    free(resultMatrix);
    return 0;
}

#endif /* SPARSE_MATMUL_NO_MAIN */