ADD file via upload

10 months ago · 666a5397c4
parent 339ac48183
commit 666a5397c4
1 changed files with 171 additions and 0 deletions
--- a/(3).c
+++ b/(3).c
@@ -0,0 +1,171 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <arm_neon.h>
+
+// Fixed dimensions (rows and columns) shared by every dense matrix in this file
+#define ROWS 10
+#define COLS 10
+
+// Sparse matrix stored as parallel arrays: stored entry k is the triplet
+// (rowIndex[k], colIndex[k], values[k])
+typedef struct {
+    float *values;     // value of each stored entry
+    int *rowIndex;     // row position of each stored entry
+    int *colIndex;     // column position of each stored entry
+    int nonZeroCount;  // number of stored entries (length of the three arrays)
+} SparseMatrix;
+
+// Expands a sparse matrix into a dense ROWS x COLS matrix.
+//
+// sparse:      source triplets; may be NULL or hold NULL arrays, in which
+//              case the result is simply all zeros.
+// denseMatrix: destination; every element is overwritten.
+//
+// Entries whose row or column lies outside the dense matrix are skipped so
+// that malformed input cannot write out of bounds. When several entries
+// address the same cell, the last one wins.
+void sparseToDense(SparseMatrix *sparse, float denseMatrix[ROWS][COLS]) {
+    // Start from an all-zero matrix; only the stored entries are filled in below
+    for (int i = 0; i < ROWS; i++) {
+        for (int j = 0; j < COLS; j++) {
+            denseMatrix[i][j] = 0.0f;
+        }
+    }
+
+    if (sparse == NULL || sparse->values == NULL ||
+        sparse->rowIndex == NULL || sparse->colIndex == NULL) {
+        return;
+    }
+
+    for (int k = 0; k < sparse->nonZeroCount; k++) {
+        int row = sparse->rowIndex[k];
+        int col = sparse->colIndex[k];
+
+        // Guard the subscripts: the indices come from caller-supplied data
+        if (row < 0 || row >= ROWS || col < 0 || col >= COLS) {
+            continue;
+        }
+        denseMatrix[row][col] = sparse->values[k];
+    }
+}
+
+// NEON优化的矩阵乘法函数
+void matmul_optimized(float A[ROWS][COLS], float B[ROWS][COLS], float C[ROWS][COLS]) {
+    for (int i = 0; i < ROWS; i++) {
+        for (int j = 0; j < COLS; j++) {
+            // 初始化结果矩阵C的当前元素为0
+            C[i][j] = 0;
+
+            // 用于累加的向量寄存器，初始化为0
+            float32x4_t vecC = vdupq_n_f32(0);
+
+            for (int k = 0; k < ROWS; k += 4) {
+                // 加载矩阵A的一行中的4个连续元素到向量寄存器
+                float32x4_t vecA = vld1q_f32(&A[i][k]);
+                // 加载矩阵B的一列中的4个连续元素到向量寄存器（注意这里要转置的逻辑，实际是按列取元素）
+                float32x4_t vecB = vld1q_f32(&B[k][j]);
+
+                // 对应元素相乘并累加到vecC
+                vecC = vmlaq_f32(vecC, vecA, vecB);
+            }
+
+            // 将累加结果从向量寄存器提取并累加到C[i][j]
+            C[i][j] += vgetq_lane_f32(vecC, 0) + vgetq_lane_f32(vecC, 1) + vgetq_lane_f32(vecC, 2) + vgetq_lane_f32(vecC, 3);
+        }
+    }
+}
+
+// Writes a ROWS x COLS matrix to stdout, one matrix row per output line.
+// Each element is printed with "%f" followed by a single space.
+void printMatrix(float matrix[ROWS][COLS]) {
+    for (int r = 0; r < ROWS; ++r) {
+        const float *rowData = matrix[r];
+
+        for (int c = 0; c < COLS; ++c) {
+            printf("%f ", rowData[c]);
+        }
+
+        // Terminate the current matrix row
+        printf("\n");
+    }
+}
+
+// Frees the arrays owned by a sparse matrix and resets it to an empty state
+static void freeSparse(SparseMatrix *m) {
+    free(m->values);
+    free(m->rowIndex);
+    free(m->colIndex);
+    m->values = NULL;
+    m->rowIndex = NULL;
+    m->colIndex = NULL;
+    m->nonZeroCount = 0;
+}
+
+// Allocates a sparse matrix with `count` entries and copies the given
+// (row, column, value) triplets into it. Returns 0 on success; on allocation
+// failure returns -1 with nothing left allocated.
+static int initSparse(SparseMatrix *m, int count, const float *values,
+                      const int *rows, const int *cols) {
+    m->nonZeroCount = count;
+    m->values = malloc((size_t)count * sizeof *m->values);
+    m->rowIndex = malloc((size_t)count * sizeof *m->rowIndex);
+    m->colIndex = malloc((size_t)count * sizeof *m->colIndex);
+
+    if (m->values == NULL || m->rowIndex == NULL || m->colIndex == NULL) {
+        freeSparse(m);
+        return -1;
+    }
+
+    for (int k = 0; k < count; k++) {
+        m->values[k] = values[k];
+        m->rowIndex[k] = rows[k];
+        m->colIndex[k] = cols[k];
+    }
+    return 0;
+}
+
+// Demo driver: builds two small sparse matrices, expands them to dense form,
+// multiplies them with the NEON routine, and reports the result and the
+// elapsed CPU time. Returns 0 on success, 1 if memory allocation fails.
+int main(void) {
+    // Stored entries of sparse matrix A
+    static const float valuesA[] = {1, 2, 3, 4, 5};
+    static const int rowsA[] = {0, 1, 2, 1, 2};
+    static const int colsA[] = {0, 1, 2, 0, 1};
+
+    // Stored entries of sparse matrix B
+    static const float valuesB[] = {2, 3, 4, 5};
+    static const int rowsB[] = {0, 1, 1, 2};
+    static const int colsB[] = {0, 1, 0, 1};
+
+    SparseMatrix sparseA;
+    SparseMatrix sparseB;
+
+    if (initSparse(&sparseA, (int)(sizeof valuesA / sizeof valuesA[0]),
+                   valuesA, rowsA, colsA) != 0) {
+        fprintf(stderr, "Memory allocation failed for sparse matrix A!\n");
+        return 1;
+    }
+
+    if (initSparse(&sparseB, (int)(sizeof valuesB / sizeof valuesB[0]),
+                   valuesB, rowsB, colsB) != 0) {
+        // A was already allocated successfully; release it before bailing out
+        freeSparse(&sparseA);
+        fprintf(stderr, "Memory allocation failed for sparse matrix B!\n");
+        return 1;
+    }
+
+    // Dense working matrices
+    float denseA[ROWS][COLS];
+    float denseB[ROWS][COLS];
+    float denseC[ROWS][COLS];
+
+    // Expand both sparse matrices to dense form
+    sparseToDense(&sparseA, denseA);
+    sparseToDense(&sparseB, denseB);
+
+    // Show the converted inputs
+    printf("matrixA:\n");
+    printMatrix(denseA);
+
+    printf("matrixB:\n");
+    printMatrix(denseB);
+
+    // Time the NEON-optimised multiplication (CPU time via clock())
+    clock_t startMul = clock();
+    matmul_optimized(denseA, denseB, denseC);
+    clock_t endMul = clock();
+    double time_taken_Mul = ((double)(endMul - startMul)) / CLOCKS_PER_SEC;
+    printf("NEON time: %lfs\n", time_taken_Mul);
+
+    // Show the product so the timed computation has an observable result
+    printf("matrixC:\n");
+    printMatrix(denseC);
+
+    freeSparse(&sparseA);
+    freeSparse(&sparseB);
+
+    return 0;
+}