ADD file via upload

12 months ago · fc89ff89cf
parent 4b3629f775
commit fc89ff89cf
1 changed files with 236 additions and 0 deletions
--- a/opoptimize.c
+++ b/opoptimize.c
@ -0,0 +1,236 @@
+#include <stdio.h>
+#include <time.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+
+#define ROW 4
+#define COL 4
+#define MAX 16
+
+typedef void (*vector_add_func)(float* , float* , float* , int );
+typedef void (*matmul_func)(float** ,float** ,float** ,int );
+typedef void (*spare_matmul_func)(float*, int*, int*, int, float*, int*, int*, int, float*, int*, int*, int*);
+
+void test_vector_add(vector_add_func func,const char * attributive){
+    int size=1024;
+    float *A = malloc(size * sizeof(float ));
+    float *B = malloc(size * sizeof(float ));
+    float *C = malloc(size * sizeof(float ));
+
+    for (int i=0;i<size;i++) {
+        A[i]=rand()%100;
+        B[i]=rand()%100;
+    }
+
+    clock_t start = clock();
+    func(A,B,C,size);
+    clock_t end = clock();
+    printf("%s向量加法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
+
+    free(A);
+    free(B);
+    free(C);
+}
+
+void vector_add(float* A, float* B, float* C, int size) {
+    for (int i = 0;i< size;++i){
+        //加载A和B向量的4个浮点数到NEON寄存器
+        C[i]=A[i]+B[i];
+    }
+}
+
+void vector_add_optimized(float* A, float* B, float* C, int size) {
+    for (int i = 0;i< size; i+= 4){
+        //加载A和B向量的4个浮点数到NEON寄存器
+        float32x4_t vecA = vld1q_f32(&A[i]);
+        float32x4_t vecB = vld1q_f32(&B[i]);
+
+        float32x4_t vecC =vaddq_f32(vecA,vecB);
+        //将结果存储到c向量
+        vst1q_f32(&C[i], vecC);
+
+    }
+
+}
+
+void test_matmul(matmul_func func,const char * attributive) {
+    const int n=1024;
+    float **A = malloc(n * sizeof(float *));
+    float **B = malloc(n * sizeof(float *));
+    float **C = malloc(n * sizeof(float *));
+    for (int i = 0; i< n; ++i) {
+        A[i] = malloc(n * sizeof(float *));
+        B[i] = malloc(n * sizeof(float *));
+        C[i] = malloc(n * sizeof(float *));
+    }
+
+    for (int i=0;i<n;i++) {
+        for (int j=0;j<n;j++) {
+            A[i][j]=rand()%100;
+        }
+    }
+    clock_t start = clock();
+    func(A,B,C,n);
+    clock_t end = clock();
+    printf("%s稠密向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
+
+    for (int i = 0; i< n; ++i) {
+        free(A[i]);
+        free(B[i]);
+        free(C[i]);
+    }
+
+    free(A);
+    free(B);
+    free(C);
+}
+
+void matmul(float** A,float** B,float** C,int n){
+    for (int i = 0; i< n;++i){
+        for (int j = 0; j< n; ++j){
+            C[i][j] =0;
+            for (int k = 0; k< n; ++k) {
+                C[i][j] += A[i][k] * B[k][j];
+            }
+        }
+    }
+}
+
+void matmul_optimized(float** A,float** B,float** C,int n){
+    //疑似还要对B矩阵转置
+    for (int i = 0; i< n;++i){
+        for (int j = 0; j< n; ++j){
+            float32x4_t vecC=vdupq_n_f32(0.0);
+            for (int k = 0; k< n; k+=4) {
+                float32x4_t vecA = vld1q_f32(&A[i][k]);
+                float32x4_t vecB = vld1q_f32(&B[k][j]);
+                vecC = vmlaq_f32(vecC, vecA, vecB);
+            }
+            C[i][j] = vgetq_lane_f32(vecC, 0) + vgetq_lane_f32(vecC, 1) + vgetq_lane_f32(vecC, 2) + vgetq_lane_f32(vecC, 3);
+        }
+    }
+}
+
+void test_sparse_matmul(spare_matmul_func func,const char * attributive) {
+
+    float A_values[] = {1, 2, 3, 4, 5};
+    int A_rowIndex[] = {0, 0,1, 2, 2};
+    int A_colIndex[] = {0, 2, 1, 0, 2};
+    int A_nonZeroCount = 5;
+    //矩阵B的COO格式
+    float B_values[] = {6, 8, 7, 9};
+    int B_rowIndex[] = {0,2, 1, 2};
+    int B_colIndex[] = {0, 0, 1, 2};
+    int B_nonZeroCount = 4;
+    //结果矩阵C的coo格式
+    float C_values[MAX];
+    int C_rowIndex[MAX];
+    int C_colIndex[MAX];
+    int C_nonZeroCount = 0;
+
+    clock_t start = clock();
+    func(A_values,A_rowIndex,A_colIndex,A_nonZeroCount,B_values,B_rowIndex,B_colIndex,B_nonZeroCount,C_values,C_rowIndex,C_colIndex,&C_nonZeroCount);
+    clock_t end = clock();
+    printf("%s稀疏向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
+}
+void sparse_matmul_coo( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
+                        float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
+                        float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
+    int currentIndex = 0;
+    //遍历A的非零元素
+    for (int i = 0; i<A_nonZeroCount; i++) {
+        int colA = A_colIndex[i];int rowA = A_rowIndex[i];
+        float valueA = A_values[i];
+        //遍历B的非零元素
+        for (int j=0;j<B_nonZeroCount;j++) {
+            int rowB = B_rowIndex[j];
+            int colB = B_colIndex[j];
+            float valueB = B_values[j];
+            //如果A的列和B的行匹配，则计算乘积并存储结果
+            if (colA == rowB) {
+                float product = valueA * valueB;
+                //检查是否已有此（rowA，colB）项
+                int found = 0;
+                for (int k = 0;k< currentIndex; k++) {
+                    if (C_rowIndex[k] == rowA && C_colIndex[k] == colB){
+                        C_values[k] += product;
+                        found = 1;
+                        break;
+                    }
+                }
+                if (!found){
+                    //添加新的非零元素
+                    C_values[currentIndex] = product;
+                    C_rowIndex[currentIndex] = rowA;
+                    C_colIndex[currentIndex] = colB;
+                    currentIndex++;
+                }
+                //更新非零元素数量
+            }
+        }
+    }
+    *C_nonZeroCount =currentIndex;
+}
+
+void sparse_matmul_coo_optimized( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
+                        float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
+                        float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
+
+    const int n=4;
+    float **A = malloc(n * sizeof(float *));
+    float **B = malloc(n * sizeof(float *));
+    float **C = malloc(n * sizeof(float *));
+    for (int i = 0; i< n; ++i) {
+        A[i] = malloc(n * sizeof(float *));
+        B[i] = malloc(n * sizeof(float *));
+        C[i] = malloc(n * sizeof(float *));
+    }
+
+    for (int i = 0; i < A_nonZeroCount; i++) {
+        int row = A_rowIndex[i];
+        int col = A_colIndex[i];
+        A[row][col] = A_values[i];
+    }
+
+    for (int i = 0; i < B_nonZeroCount; i++) {
+        int row = B_rowIndex[i];
+        int col = B_colIndex[i];
+        B[row][col] = B_values[i];
+    }
+
+
+    matmul_optimized(A,B,C,n);
+
+    for (int i=0;i<n;i++) {
+        for (int j=0;j<n;j++) {
+            if (C[i][j]!=0) {
+                *C_nonZeroCount++;
+                *C_values=C[i][j];C_values++;
+                *C_colIndex=i;C_colIndex++;
+                *C_rowIndex=i;C_rowIndex++;
+            }
+        }
+    }
+    for (int i = 0; i< n; ++i) {
+        free(A[i]);
+        free(B[i]);
+        free(C[i]);
+    }
+
+    free(A);
+    free(B);
+    free(C);
+}
+
+int main(){
+    test_vector_add(vector_add,"正常");
+    test_vector_add(vector_add_optimized,"优化");
+
+    test_matmul(matmul,"正常");
+    test_matmul(matmul_optimized,"优化");
+
+    test_sparse_matmul(sparse_matmul_coo,"正常");
+    test_sparse_matmul(sparse_matmul_coo_optimized,"优化");
+
+    return 0;
+}