|  |  | @ -0,0 +1,236 @@ | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | #include <stdio.h> | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | #include <time.h> | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | #include <stdlib.h> | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | #include <arm_neon.h> | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					/* Compile-time sizes used by the benchmarks in this file. */
					#define ROW 4   /* not referenced by the code visible in this file */
					#define COL 4   /* not referenced by the code visible in this file */
					#define MAX 16  /* capacity of the COO result arrays in test_sparse_matmul */
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					/*
					 * Function-pointer types accepted by the test harnesses, so that one
					 * harness can time any kernel with the matching signature.
					 */

					/* Element-wise addition kernel: reads A and B, writes C, `size` elements. */
					typedef void (*vector_add_func)(float *A, float *B, float *C, int size);

					/* Dense n x n product kernel; each matrix is a table of row pointers. */
					typedef void (*matmul_func)(float **A, float **B, float **C, int n);

					/*
					 * COO sparse product kernel.  A and B are passed as (values, row indices,
					 * column indices, entry count); the result is written to the C_* arrays
					 * and its entry count to *C_nonZeroCount.
					 * The spelling "spare" is kept because test_sparse_matmul uses this name.
					 */
					typedef void (*spare_matmul_func)(float *A_values, int *A_rowIndex, int *A_colIndex, int A_nonZeroCount,
					                                  float *B_values, int *B_rowIndex, int *B_colIndex, int B_nonZeroCount,
					                                  float *C_values, int *C_rowIndex, int *C_colIndex, int *C_nonZeroCount);
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | void test_vector_add(vector_add_func func,const char * attributive){ | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int size=1024; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     float *A = malloc(size * sizeof(float )); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     float *B = malloc(size * sizeof(float )); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     float *C = malloc(size * sizeof(float )); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     for (int i=0;i<size;i++) { | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         A[i]=rand()%100; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         B[i]=rand()%100; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     } | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     clock_t start = clock(); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     func(A,B,C,size); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     clock_t end = clock(); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     printf("%s向量加法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     free(A); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     free(B); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     free(C); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | } | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					/**
					 * Scalar reference implementation: C[i] = A[i] + B[i] for i in [0, size).
					 *
					 * @param A    first input vector, at least `size` elements
					 * @param B    second input vector, at least `size` elements
					 * @param C    output vector, at least `size` elements
					 * @param size number of elements to add; nothing is written when <= 0
					 */
					void vector_add(float* A, float* B, float* C, int size) {
					    for (int i = 0; i < size; ++i) {
					        C[i] = A[i] + B[i];
					    }
					}
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					/**
					 * SIMD vector addition: C[i] = A[i] + B[i] for i in [0, size).
					 *
					 * Complete groups of four floats are processed in 128-bit NEON registers.
					 * The remaining 0-3 elements are added one at a time, so `size` need not
					 * be a multiple of four and no load or store reaches past element
					 * size - 1.  When the compiler does not advertise NEON support the scalar
					 * loop handles every element.
					 *
					 * @param A    first input vector, at least `size` elements
					 * @param B    second input vector, at least `size` elements
					 * @param C    output vector, at least `size` elements
					 * @param size number of elements to add
					 */
					void vector_add_optimized(float* A, float* B, float* C, int size) {
					    int i = 0;

					#if defined(__ARM_NEON) || defined(__ARM_NEON__)
					    /* Vector body: only runs while a full group of four is left. */
					    for (; i <= size - 4; i += 4) {
					        float32x4_t vecA = vld1q_f32(&A[i]);
					        float32x4_t vecB = vld1q_f32(&B[i]);
					        vst1q_f32(&C[i], vaddq_f32(vecA, vecB));
					    }
					#endif

					    /* Scalar tail (or the whole vector when NEON is unavailable). */
					    for (; i < size; ++i) {
					        C[i] = A[i] + B[i];
					    }
					}
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | void test_matmul(matmul_func func,const char * attributive) { | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     const int n=1024; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     float **A = malloc(n * sizeof(float *)); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     float **B = malloc(n * sizeof(float *)); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     float **C = malloc(n * sizeof(float *)); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     for (int i = 0; i< n; ++i) { | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         A[i] = malloc(n * sizeof(float *)); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         B[i] = malloc(n * sizeof(float *)); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         C[i] = malloc(n * sizeof(float *)); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     } | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     for (int i=0;i<n;i++) { | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         for (int j=0;j<n;j++) { | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             A[i][j]=rand()%100; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         } | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     } | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     clock_t start = clock(); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     func(A,B,C,n); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     clock_t end = clock(); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     printf("%s稠密向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     for (int i = 0; i< n; ++i) { | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         free(A[i]); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         free(B[i]); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         free(C[i]); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     } | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     free(A); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     free(B); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     free(C); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | } | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					/**
					 * Scalar reference implementation of the dense product C = A * B.
					 *
					 * All three matrices are n x n and are addressed through tables of row
					 * pointers.  Every element of C is overwritten.
					 *
					 * @param A left operand
					 * @param B right operand
					 * @param C result; must not share storage with A or B
					 * @param n matrix dimension
					 */
					void matmul(float** A,float** B,float** C,int n){
					    for (int i = 0; i < n; ++i) {
					        for (int j = 0; j < n; ++j) {
					            /* Accumulate in a local so C[i][j] is stored once per element. */
					            float sum = 0.0f;
					            for (int k = 0; k < n; ++k) {
					                sum += A[i][k] * B[k][j];
					            }
					            C[i][j] = sum;
					        }
					    }
					}
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					/**
					 * SIMD implementation of the dense product C = A * B (all n x n, addressed
					 * through tables of row pointers).
					 *
					 * The kernel vectorises across the columns of C: for a fixed row i and a
					 * group of four adjacent columns j..j+3 it accumulates
					 *     acc += A[i][k] * B[k][j..j+3]      for k = 0 .. n-1
					 * B[k][j..j+3] is contiguous in memory, so B does not have to be
					 * transposed, and each lane sums its terms in the same k order as the
					 * scalar triple loop.  Columns left over when n is not a multiple of four
					 * are computed by the scalar loop, which also does all the work when the
					 * compiler does not advertise NEON support.
					 *
					 * @param A left operand
					 * @param B right operand
					 * @param C result; every element is overwritten; must not share storage
					 *          with A or B
					 * @param n matrix dimension
					 */
					void matmul_optimized(float** A,float** B,float** C,int n){
					    for (int i = 0; i < n; ++i) {
					        int j = 0;

					#if defined(__ARM_NEON) || defined(__ARM_NEON__)
					        /* Four result columns at a time; loads never pass column n - 1. */
					        for (; j <= n - 4; j += 4) {
					            float32x4_t acc = vdupq_n_f32(0.0f);
					            for (int k = 0; k < n; ++k) {
					                float32x4_t b_slice = vld1q_f32(&B[k][j]);
					                /* Broadcast the scalar A[i][k] to all four lanes. */
					                acc = vmlaq_f32(acc, vdupq_n_f32(A[i][k]), b_slice);
					            }
					            vst1q_f32(&C[i][j], acc);
					        }
					#endif

					        /* Remaining columns (all of them when NEON is unavailable). */
					        for (; j < n; ++j) {
					            float sum = 0.0f;
					            for (int k = 0; k < n; ++k) {
					                sum += A[i][k] * B[k][j];
					            }
					            C[i][j] = sum;
					        }
					    }
					}
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | void test_sparse_matmul(spare_matmul_func func,const char * attributive) { | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     float A_values[] = {1, 2, 3, 4, 5}; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int A_rowIndex[] = {0, 0,1, 2, 2}; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int A_colIndex[] = {0, 2, 1, 0, 2}; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int A_nonZeroCount = 5; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     //矩阵B的COO格式
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     float B_values[] = {6, 8, 7, 9}; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int B_rowIndex[] = {0,2, 1, 2}; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int B_colIndex[] = {0, 0, 1, 2}; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int B_nonZeroCount = 4; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     //结果矩阵C的coo格式
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     float C_values[MAX]; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int C_rowIndex[MAX]; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int C_colIndex[MAX]; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     int C_nonZeroCount = 0; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     clock_t start = clock(); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     func(A_values,A_rowIndex,A_colIndex,A_nonZeroCount,B_values,B_rowIndex,B_colIndex,B_nonZeroCount,C_values,C_rowIndex,C_colIndex,&C_nonZeroCount); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     clock_t end = clock(); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     printf("%s稀疏向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | } | 
			
		
	
		
		
			
				
					
					/**
					 * Reference sparse product C = A * B with all matrices in COO form.
					 *
					 * Every pair (entry of A, entry of B) whose A column equals the B row
					 * contributes A_value * B_value to C(rowA, colB).  Contributions to the
					 * same coordinate are summed into one result entry; result entries appear
					 * in the order their coordinate is first produced.
					 *
					 * @param A_values, A_rowIndex, A_colIndex  entries of A
					 * @param A_nonZeroCount                    number of entries of A
					 * @param B_values, B_rowIndex, B_colIndex  entries of B
					 * @param B_nonZeroCount                    number of entries of B
					 * @param C_values, C_rowIndex, C_colIndex  output arrays; the caller must
					 *        provide room for every distinct result coordinate, no capacity
					 *        is passed in or checked here
					 * @param C_nonZeroCount                    receives the number of result
					 *        entries written
					 */
					void sparse_matmul_coo( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
					                        float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
					                        float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
					    int used = 0;   /* result entries written so far */

					    for (int a = 0; a < A_nonZeroCount; a++) {
					        const int rowA = A_rowIndex[a];
					        const int colA = A_colIndex[a];

					        for (int b = 0; b < B_nonZeroCount; b++) {
					            /* Only entries on the shared inner index multiply together. */
					            if (B_rowIndex[b] != colA) {
					                continue;
					            }

					            const int colB = B_colIndex[b];
					            const float product = A_values[a] * B_values[b];

					            /* Linear search for an existing (rowA, colB) entry to add to. */
					            int slot = 0;
					            while (slot < used &&
					                   !(C_rowIndex[slot] == rowA && C_colIndex[slot] == colB)) {
					                slot++;
					            }

					            if (slot < used) {
					                C_values[slot] += product;
					            } else {
					                /* First contribution to this coordinate: append it. */
					                C_values[used] = product;
					                C_rowIndex[used] = rowA;
					                C_colIndex[used] = colB;
					                used++;
					            }
					        }
					    }

					    *C_nonZeroCount = used;
					}
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					/**
					 * Sparse product C = A * B computed by expanding both COO operands into
					 * dense matrices, multiplying them with matmul_optimized() and collecting
					 * the non-zero results back into COO form.
					 *
					 * The dense dimension is one past the largest row/column index found in
					 * either operand, rounded up to a multiple of four; the extra rows and
					 * columns are zero and do not change the product.  Memory use grows with
					 * the square of that dimension.
					 *
					 * Result entries are emitted in row-major order.  Coordinates whose dense
					 * value is exactly zero are not emitted.  Entries of an operand that share
					 * a coordinate are summed.
					 *
					 * If an index is negative or memory cannot be allocated, *C_nonZeroCount
					 * is set to 0 and nothing else is written.
					 *
					 * @param A_values, A_rowIndex, A_colIndex  entries of A
					 * @param A_nonZeroCount                    number of entries of A
					 * @param B_values, B_rowIndex, B_colIndex  entries of B
					 * @param B_nonZeroCount                    number of entries of B
					 * @param C_values, C_rowIndex, C_colIndex  output arrays; the caller must
					 *        provide room for every non-zero of the product, no capacity is
					 *        passed in or checked here
					 * @param C_nonZeroCount                    receives the number of result
					 *        entries written
					 */
					void sparse_matmul_coo_optimized( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
					                        float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
					                        float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
					    *C_nonZeroCount = 0;

					    /* Find the dense dimension and reject negative indices up front. */
					    int dim = 0;
					    for (int i = 0; i < A_nonZeroCount; i++) {
					        if (A_rowIndex[i] < 0 || A_colIndex[i] < 0) {
					            return;
					        }
					        if (A_rowIndex[i] >= dim) dim = A_rowIndex[i] + 1;
					        if (A_colIndex[i] >= dim) dim = A_colIndex[i] + 1;
					    }
					    for (int i = 0; i < B_nonZeroCount; i++) {
					        if (B_rowIndex[i] < 0 || B_colIndex[i] < 0) {
					            return;
					        }
					        if (B_rowIndex[i] >= dim) dim = B_rowIndex[i] + 1;
					        if (B_colIndex[i] >= dim) dim = B_colIndex[i] + 1;
					    }
					    if (dim == 0) {
					        return;             /* both operands are empty */
					    }
					    dim = (dim + 3) / 4 * 4;    /* whole groups of four per row */

					    /*
					     * One zero-filled slab holds A, B and C back to back; `rows` holds the
					     * three row-pointer tables the dense kernel expects.
					     */
					    const size_t cells = (size_t)dim * (size_t)dim;
					    float *slab = calloc(3 * cells, sizeof *slab);
					    float **rows = malloc(3 * (size_t)dim * sizeof *rows);
					    if (slab == NULL || rows == NULL) {
					        free(slab);
					        free(rows);
					        return;
					    }

					    float **A = rows;
					    float **B = rows + dim;
					    float **C = rows + 2 * (size_t)dim;
					    for (int r = 0; r < dim; r++) {
					        A[r] = slab + (size_t)r * (size_t)dim;
					        B[r] = slab + cells + (size_t)r * (size_t)dim;
					        C[r] = slab + 2 * cells + (size_t)r * (size_t)dim;
					    }

					    /* Scatter the COO entries into the dense operands. */
					    for (int i = 0; i < A_nonZeroCount; i++) {
					        A[A_rowIndex[i]][A_colIndex[i]] += A_values[i];
					    }
					    for (int i = 0; i < B_nonZeroCount; i++) {
					        B[B_rowIndex[i]][B_colIndex[i]] += B_values[i];
					    }

					    matmul_optimized(A, B, C, dim);

					    /* Gather the non-zero results: row index i, column index j. */
					    int count = 0;
					    for (int i = 0; i < dim; i++) {
					        for (int j = 0; j < dim; j++) {
					            if (C[i][j] != 0) {
					                C_values[count] = C[i][j];
					                C_rowIndex[count] = i;
					                C_colIndex[count] = j;
					                count++;
					            }
					        }
					    }
					    *C_nonZeroCount = count;

					    free(rows);
					    free(slab);
					}
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | int main(){ | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     test_vector_add(vector_add,"正常"); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     test_vector_add(vector_add_optimized,"优化"); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     test_matmul(matmul,"正常"); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     test_matmul(matmul_optimized,"优化"); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     test_sparse_matmul(sparse_matmul_coo,"正常"); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     test_sparse_matmul(sparse_matmul_coo_optimized,"优化"); | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     return 0; | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | } |