operator_optimization/opoptimize.c

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <arm_neon.h>

#define ROW 4
#define COL 4
#define MAX 16

typedef void (*vector_add_func)(float* , float* , float* , int );
typedef void (*matmul_func)(float** ,float** ,float** ,int );
typedef void (*spare_matmul_func)(float*, int*, int*, int, float*, int*, int*, int, float*, int*, int*, int*);

void test_vector_add(vector_add_func func,const char * attributive){
    int size=1024;
    float *A = malloc(size * sizeof(float ));
    float *B = malloc(size * sizeof(float ));
    float *C = malloc(size * sizeof(float ));

    for (int i=0;i<size;i++) {
        A[i]=rand()%100;
        B[i]=rand()%100;
    }

    clock_t start = clock();
    func(A,B,C,size);
    clock_t end = clock();
    printf("%s向量加法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);

    free(A);
    free(B);
    free(C);
}

void vector_add(float* A, float* B, float* C, int size) {
    for (int i = 0;i< size;++i){
        //加载A和B向量的4个浮点数到NEON寄存器
        C[i]=A[i]+B[i];
    }
}

void vector_add_optimized(float* A, float* B, float* C, int size) {
    for (int i = 0;i< size; i+= 4){
        //加载A和B向量的4个浮点数到NEON寄存器
        float32x4_t vecA = vld1q_f32(&A[i]);
        float32x4_t vecB = vld1q_f32(&B[i]);

        float32x4_t vecC =vaddq_f32(vecA,vecB);
        //将结果存储到c向量
        vst1q_f32(&C[i], vecC);

    }

}

void test_matmul(matmul_func func,const char * attributive) {
    const int n=1024;
    float **A = malloc(n * sizeof(float *));
    float **B = malloc(n * sizeof(float *));
    float **C = malloc(n * sizeof(float *));
    for (int i = 0; i< n; ++i) {
        A[i] = malloc(n * sizeof(float *));
        B[i] = malloc(n * sizeof(float *));
        C[i] = malloc(n * sizeof(float *));
    }

    for (int i=0;i<n;i++) {
        for (int j=0;j<n;j++) {
            A[i][j]=rand()%100;
        }
    }
    clock_t start = clock();
    func(A,B,C,n);
    clock_t end = clock();
    printf("%s稠密向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);

    for (int i = 0; i< n; ++i) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
    }

    free(A);
    free(B);
    free(C);
}

void matmul(float** A,float** B,float** C,int n){
    for (int i = 0; i< n;++i){
        for (int j = 0; j< n; ++j){
            C[i][j] =0;
            for (int k = 0; k< n; ++k) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
}

void matmul_optimized(float** A,float** B,float** C,int n){
    //疑似还要对B矩阵转置
    for (int i = 0; i< n;++i){
        for (int j = 0; j< n; ++j){
            float32x4_t vecC=vdupq_n_f32(0.0);
            for (int k = 0; k< n; k+=4) {
                float32x4_t vecA = vld1q_f32(&A[i][k]);
                float32x4_t vecB = vld1q_f32(&B[k][j]);
                vecC = vmlaq_f32(vecC, vecA, vecB);
            }
            C[i][j] = vgetq_lane_f32(vecC, 0) + vgetq_lane_f32(vecC, 1) + vgetq_lane_f32(vecC, 2) + vgetq_lane_f32(vecC, 3);
        }
    }
}

void test_sparse_matmul(spare_matmul_func func,const char * attributive) {

    float A_values[] = {1, 2, 3, 4, 5};
    int A_rowIndex[] = {0, 0,1, 2, 2};
    int A_colIndex[] = {0, 2, 1, 0, 2};
    int A_nonZeroCount = 5;
    //矩阵B的COO格式
    float B_values[] = {6, 8, 7, 9};
    int B_rowIndex[] = {0,2, 1, 2};
    int B_colIndex[] = {0, 0, 1, 2};
    int B_nonZeroCount = 4;
    //结果矩阵C的coo格式
    float C_values[MAX];
    int C_rowIndex[MAX];
    int C_colIndex[MAX];
    int C_nonZeroCount = 0;

    clock_t start = clock();
    func(A_values,A_rowIndex,A_colIndex,A_nonZeroCount,B_values,B_rowIndex,B_colIndex,B_nonZeroCount,C_values,C_rowIndex,C_colIndex,&C_nonZeroCount);
    clock_t end = clock();
    printf("%s稀疏向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
}
void sparse_matmul_coo( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
                        float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
                        float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
    int currentIndex = 0;
    //遍历A的非零元素
    for (int i = 0; i<A_nonZeroCount; i++) {
        int colA = A_colIndex[i];int rowA = A_rowIndex[i];
        float valueA = A_values[i];
        //遍历B的非零元素
        for (int j=0;j<B_nonZeroCount;j++) {
            int rowB = B_rowIndex[j];
            int colB = B_colIndex[j];
            float valueB = B_values[j];
            //如果A的列和B的行匹配，则计算乘积并存储结果
            if (colA == rowB) {
                float product = valueA * valueB;
                //检查是否已有此（rowA，colB）项
                int found = 0;
                for (int k = 0;k< currentIndex; k++) {
                    if (C_rowIndex[k] == rowA && C_colIndex[k] == colB){
                        C_values[k] += product;
                        found = 1;
                        break;
                    }
                }
                if (!found){
                    //添加新的非零元素
                    C_values[currentIndex] = product;
                    C_rowIndex[currentIndex] = rowA;
                    C_colIndex[currentIndex] = colB;
                    currentIndex++;
                }
                //更新非零元素数量
            }
        }
    }
    *C_nonZeroCount =currentIndex;
}

void sparse_matmul_coo_optimized( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
                        float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
                        float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {

    const int n=4;
    float **A = malloc(n * sizeof(float *));
    float **B = malloc(n * sizeof(float *));
    float **C = malloc(n * sizeof(float *));
    for (int i = 0; i< n; ++i) {
        A[i] = malloc(n * sizeof(float *));
        B[i] = malloc(n * sizeof(float *));
        C[i] = malloc(n * sizeof(float *));
    }

    for (int i = 0; i < A_nonZeroCount; i++) {
        int row = A_rowIndex[i];
        int col = A_colIndex[i];
        A[row][col] = A_values[i];
    }

    for (int i = 0; i < B_nonZeroCount; i++) {
        int row = B_rowIndex[i];
        int col = B_colIndex[i];
        B[row][col] = B_values[i];
    }


    matmul_optimized(A,B,C,n);

    for (int i=0;i<n;i++) {
        for (int j=0;j<n;j++) {
            if (C[i][j]!=0) {
                *C_nonZeroCount++;
                *C_values=C[i][j];C_values++;
                *C_colIndex=i;C_colIndex++;
                *C_rowIndex=i;C_rowIndex++;
            }
        }
    }
    for (int i = 0; i< n; ++i) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
    }

    free(A);
    free(B);
    free(C);
}

int main(){
    test_vector_add(vector_add,"正常");
    test_vector_add(vector_add_optimized,"优化");

    test_matmul(matmul,"正常");
    test_matmul(matmul_optimized,"优化");

    test_sparse_matmul(sparse_matmul_coo,"正常");
    test_sparse_matmul(sparse_matmul_coo_optimized,"优化");

    return 0;
}