#include #include #include #include #define M 27 #define N 25 #define Q 12 //A(M*N),B(N*Q),C(M*Q),transposed(Q*N)对于矩阵维度的说明 void transposeMatrix(float** matrix, float** transposed) { for (int i = 0; i < N; i++) { for (int j = 0; j < Q; j++) { transposed[j][i] = matrix[i][j]; } } } void matmul_optimized(float** A, float** B, float** C) { for (int i = 0; i < M; i++) { for (int j = 0; j < Q; j++) { float sum = 0.0f; for (int k = 0; k < N; k += 4) { float32x4_t vecA, vecB, vecC; if (k + 4 <= N) { // 加载A和B的4个元素,进行向量化计算 vecA = vld1q_f32(&A[i][k]); vecB = vld1q_f32(&B[j][k]); // 向量化乘法并累加结果 vecC = vmulq_f32(vecA, vecB); sum += vgetq_lane_f32(vecC, 0) + vgetq_lane_f32(vecC, 1) + vgetq_lane_f32(vecC, 2) + vgetq_lane_f32(vecC, 3); } else { // 处理剩余的元素 for (int m = k; m < N; m++) { sum += A[i][m] * B[j][m]; } } } C[i][j] = sum; } } } int main() { //矩阵 A 的 COO 格式 float A_values[] = {1, 2, 3,4,5}; int A_rowIndex[] = {0, 0, 1, 2, 2}; int A_colIndex[] = {0, 2, 1,0, 2}; int A_nonZeroCount = 5; // 矩阵 B 的 COO 格式 float B_values[] = {6,8,7,9}; int B_rowIndex[] = {0,2, 1, 2}; int B_colIndex[] ={0,0,1, 2}; int B_nonZeroCount=4; //动态分配内存 float** denseMatrixA = (float**)malloc(M * sizeof(float*)); float** denseMatrixB = (float**)malloc(N * sizeof(float*)); float** C = (float**)malloc(M * sizeof(float*)); float** transposed = (float**)malloc(Q * sizeof(float*)); for (int i = 0; i < M; i++) { denseMatrixA[i] = (float*)malloc(N * sizeof(float)); C[i] = (float*)malloc(Q * sizeof(float)); } for (int i = 0; i < N; ++i) { denseMatrixB[i] = (float*)malloc(Q * sizeof(float)); } for (int i = 0; i < Q; ++i) { transposed[i] = (float*)malloc(N * sizeof(float)); } // 实现稀疏矩阵转换为普通矩阵 for(int i=0;i