|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <time.h>
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <arm_neon.h>
|
|
|
|
|
|
|
|
|
|
#define ROW 4
|
|
|
|
|
#define COL 4
|
|
|
|
|
#define MAX 16
|
|
|
|
|
|
|
|
|
|
typedef void (*vector_add_func)(float* , float* , float* , int );
|
|
|
|
|
typedef void (*matmul_func)(float** ,float** ,float** ,int );
|
|
|
|
|
typedef void (*spare_matmul_func)(float*, int*, int*, int, float*, int*, int*, int, float*, int*, int*, int*);
|
|
|
|
|
|
|
|
|
|
void test_vector_add(vector_add_func func,const char * attributive){
|
|
|
|
|
int size=1024;
|
|
|
|
|
float *A = malloc(size * sizeof(float ));
|
|
|
|
|
float *B = malloc(size * sizeof(float ));
|
|
|
|
|
float *C = malloc(size * sizeof(float ));
|
|
|
|
|
|
|
|
|
|
for (int i=0;i<size;i++) {
|
|
|
|
|
A[i]=rand()%100;
|
|
|
|
|
B[i]=rand()%100;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
clock_t start = clock();
|
|
|
|
|
func(A,B,C,size);
|
|
|
|
|
clock_t end = clock();
|
|
|
|
|
printf("%s向量加法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
|
|
|
|
|
|
|
|
|
|
free(A);
|
|
|
|
|
free(B);
|
|
|
|
|
free(C);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void vector_add(float* A, float* B, float* C, int size) {
|
|
|
|
|
for (int i = 0;i< size;++i){
|
|
|
|
|
//加载A和B向量的4个浮点数到NEON寄存器
|
|
|
|
|
C[i]=A[i]+B[i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void vector_add_optimized(float* A, float* B, float* C, int size) {
|
|
|
|
|
for (int i = 0;i< size; i+= 4){
|
|
|
|
|
//加载A和B向量的4个浮点数到NEON寄存器
|
|
|
|
|
float32x4_t vecA = vld1q_f32(&A[i]);
|
|
|
|
|
float32x4_t vecB = vld1q_f32(&B[i]);
|
|
|
|
|
|
|
|
|
|
float32x4_t vecC =vaddq_f32(vecA,vecB);
|
|
|
|
|
//将结果存储到c向量
|
|
|
|
|
vst1q_f32(&C[i], vecC);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void test_matmul(matmul_func func,const char * attributive) {
|
|
|
|
|
const int n=1024;
|
|
|
|
|
float **A = malloc(n * sizeof(float *));
|
|
|
|
|
float **B = malloc(n * sizeof(float *));
|
|
|
|
|
float **C = malloc(n * sizeof(float *));
|
|
|
|
|
for (int i = 0; i< n; ++i) {
|
|
|
|
|
A[i] = malloc(n * sizeof(float *));
|
|
|
|
|
B[i] = malloc(n * sizeof(float *));
|
|
|
|
|
C[i] = malloc(n * sizeof(float *));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int i=0;i<n;i++) {
|
|
|
|
|
for (int j=0;j<n;j++) {
|
|
|
|
|
A[i][j]=rand()%100;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
clock_t start = clock();
|
|
|
|
|
func(A,B,C,n);
|
|
|
|
|
clock_t end = clock();
|
|
|
|
|
printf("%s稠密向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i< n; ++i) {
|
|
|
|
|
free(A[i]);
|
|
|
|
|
free(B[i]);
|
|
|
|
|
free(C[i]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
free(A);
|
|
|
|
|
free(B);
|
|
|
|
|
free(C);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void matmul(float** A,float** B,float** C,int n){
|
|
|
|
|
for (int i = 0; i< n;++i){
|
|
|
|
|
for (int j = 0; j< n; ++j){
|
|
|
|
|
C[i][j] =0;
|
|
|
|
|
for (int k = 0; k< n; ++k) {
|
|
|
|
|
C[i][j] += A[i][k] * B[k][j];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void matmul_optimized(float** A,float** B,float** C,int n){
|
|
|
|
|
//疑似还要对B矩阵转置
|
|
|
|
|
for (int i = 0; i< n;++i){
|
|
|
|
|
for (int j = 0; j< n; ++j){
|
|
|
|
|
float32x4_t vecC=vdupq_n_f32(0.0);
|
|
|
|
|
for (int k = 0; k< n; k+=4) {
|
|
|
|
|
float32x4_t vecA = vld1q_f32(&A[i][k]);
|
|
|
|
|
float32x4_t vecB = vld1q_f32(&B[k][j]);
|
|
|
|
|
vecC = vmlaq_f32(vecC, vecA, vecB);
|
|
|
|
|
}
|
|
|
|
|
C[i][j] = vgetq_lane_f32(vecC, 0) + vgetq_lane_f32(vecC, 1) + vgetq_lane_f32(vecC, 2) + vgetq_lane_f32(vecC, 3);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void test_sparse_matmul(spare_matmul_func func,const char * attributive) {
|
|
|
|
|
|
|
|
|
|
float A_values[] = {1, 2, 3, 4, 5};
|
|
|
|
|
int A_rowIndex[] = {0, 0,1, 2, 2};
|
|
|
|
|
int A_colIndex[] = {0, 2, 1, 0, 2};
|
|
|
|
|
int A_nonZeroCount = 5;
|
|
|
|
|
//矩阵B的COO格式
|
|
|
|
|
float B_values[] = {6, 8, 7, 9};
|
|
|
|
|
int B_rowIndex[] = {0,2, 1, 2};
|
|
|
|
|
int B_colIndex[] = {0, 0, 1, 2};
|
|
|
|
|
int B_nonZeroCount = 4;
|
|
|
|
|
//结果矩阵C的coo格式
|
|
|
|
|
float C_values[MAX];
|
|
|
|
|
int C_rowIndex[MAX];
|
|
|
|
|
int C_colIndex[MAX];
|
|
|
|
|
int C_nonZeroCount = 0;
|
|
|
|
|
|
|
|
|
|
clock_t start = clock();
|
|
|
|
|
func(A_values,A_rowIndex,A_colIndex,A_nonZeroCount,B_values,B_rowIndex,B_colIndex,B_nonZeroCount,C_values,C_rowIndex,C_colIndex,&C_nonZeroCount);
|
|
|
|
|
clock_t end = clock();
|
|
|
|
|
printf("%s稀疏向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
|
|
|
|
|
}
|
|
|
|
|
void sparse_matmul_coo( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
|
|
|
|
|
float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
|
|
|
|
|
float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
|
|
|
|
|
int currentIndex = 0;
|
|
|
|
|
//遍历A的非零元素
|
|
|
|
|
for (int i = 0; i<A_nonZeroCount; i++) {
|
|
|
|
|
int colA = A_colIndex[i];int rowA = A_rowIndex[i];
|
|
|
|
|
float valueA = A_values[i];
|
|
|
|
|
//遍历B的非零元素
|
|
|
|
|
for (int j=0;j<B_nonZeroCount;j++) {
|
|
|
|
|
int rowB = B_rowIndex[j];
|
|
|
|
|
int colB = B_colIndex[j];
|
|
|
|
|
float valueB = B_values[j];
|
|
|
|
|
//如果A的列和B的行匹配,则计算乘积并存储结果
|
|
|
|
|
if (colA == rowB) {
|
|
|
|
|
float product = valueA * valueB;
|
|
|
|
|
//检查是否已有此(rowA,colB)项
|
|
|
|
|
int found = 0;
|
|
|
|
|
for (int k = 0;k< currentIndex; k++) {
|
|
|
|
|
if (C_rowIndex[k] == rowA && C_colIndex[k] == colB){
|
|
|
|
|
C_values[k] += product;
|
|
|
|
|
found = 1;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (!found){
|
|
|
|
|
//添加新的非零元素
|
|
|
|
|
C_values[currentIndex] = product;
|
|
|
|
|
C_rowIndex[currentIndex] = rowA;
|
|
|
|
|
C_colIndex[currentIndex] = colB;
|
|
|
|
|
currentIndex++;
|
|
|
|
|
}
|
|
|
|
|
//更新非零元素数量
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
*C_nonZeroCount =currentIndex;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void sparse_matmul_coo_optimized( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
|
|
|
|
|
float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
|
|
|
|
|
float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
|
|
|
|
|
|
|
|
|
|
const int n=4;
|
|
|
|
|
float **A = malloc(n * sizeof(float *));
|
|
|
|
|
float **B = malloc(n * sizeof(float *));
|
|
|
|
|
float **C = malloc(n * sizeof(float *));
|
|
|
|
|
for (int i = 0; i< n; ++i) {
|
|
|
|
|
A[i] = malloc(n * sizeof(float *));
|
|
|
|
|
B[i] = malloc(n * sizeof(float *));
|
|
|
|
|
C[i] = malloc(n * sizeof(float *));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < A_nonZeroCount; i++) {
|
|
|
|
|
int row = A_rowIndex[i];
|
|
|
|
|
int col = A_colIndex[i];
|
|
|
|
|
A[row][col] = A_values[i];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < B_nonZeroCount; i++) {
|
|
|
|
|
int row = B_rowIndex[i];
|
|
|
|
|
int col = B_colIndex[i];
|
|
|
|
|
B[row][col] = B_values[i];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
matmul_optimized(A,B,C,n);
|
|
|
|
|
|
|
|
|
|
for (int i=0;i<n;i++) {
|
|
|
|
|
for (int j=0;j<n;j++) {
|
|
|
|
|
if (C[i][j]!=0) {
|
|
|
|
|
*C_nonZeroCount++;
|
|
|
|
|
*C_values=C[i][j];C_values++;
|
|
|
|
|
*C_colIndex=i;C_colIndex++;
|
|
|
|
|
*C_rowIndex=i;C_rowIndex++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i< n; ++i) {
|
|
|
|
|
free(A[i]);
|
|
|
|
|
free(B[i]);
|
|
|
|
|
free(C[i]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
free(A);
|
|
|
|
|
free(B);
|
|
|
|
|
free(C);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int main(){
|
|
|
|
|
test_vector_add(vector_add,"正常");
|
|
|
|
|
test_vector_add(vector_add_optimized,"优化");
|
|
|
|
|
|
|
|
|
|
test_matmul(matmul,"正常");
|
|
|
|
|
test_matmul(matmul_optimized,"优化");
|
|
|
|
|
|
|
|
|
|
test_sparse_matmul(sparse_matmul_coo,"正常");
|
|
|
|
|
test_sparse_matmul(sparse_matmul_coo_optimized,"优化");
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|