You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

236 lines
7.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <arm_neon.h>
#define ROW 4
#define COL 4
#define MAX 16
typedef void (*vector_add_func)(float* , float* , float* , int );
typedef void (*matmul_func)(float** ,float** ,float** ,int );
typedef void (*spare_matmul_func)(float*, int*, int*, int, float*, int*, int*, int, float*, int*, int*, int*);
void test_vector_add(vector_add_func func,const char * attributive){
int size=1024;
float *A = malloc(size * sizeof(float ));
float *B = malloc(size * sizeof(float ));
float *C = malloc(size * sizeof(float ));
for (int i=0;i<size;i++) {
A[i]=rand()%100;
B[i]=rand()%100;
}
clock_t start = clock();
func(A,B,C,size);
clock_t end = clock();
printf("%s向量加法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
free(A);
free(B);
free(C);
}
void vector_add(float* A, float* B, float* C, int size) {
for (int i = 0;i< size;++i){
//加载A和B向量的4个浮点数到NEON寄存器
C[i]=A[i]+B[i];
}
}
void vector_add_optimized(float* A, float* B, float* C, int size) {
for (int i = 0;i< size; i+= 4){
//加载A和B向量的4个浮点数到NEON寄存器
float32x4_t vecA = vld1q_f32(&A[i]);
float32x4_t vecB = vld1q_f32(&B[i]);
float32x4_t vecC =vaddq_f32(vecA,vecB);
//将结果存储到c向量
vst1q_f32(&C[i], vecC);
}
}
void test_matmul(matmul_func func,const char * attributive) {
const int n=1024;
float **A = malloc(n * sizeof(float *));
float **B = malloc(n * sizeof(float *));
float **C = malloc(n * sizeof(float *));
for (int i = 0; i< n; ++i) {
A[i] = malloc(n * sizeof(float *));
B[i] = malloc(n * sizeof(float *));
C[i] = malloc(n * sizeof(float *));
}
for (int i=0;i<n;i++) {
for (int j=0;j<n;j++) {
A[i][j]=rand()%100;
}
}
clock_t start = clock();
func(A,B,C,n);
clock_t end = clock();
printf("%s稠密向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
for (int i = 0; i< n; ++i) {
free(A[i]);
free(B[i]);
free(C[i]);
}
free(A);
free(B);
free(C);
}
void matmul(float** A,float** B,float** C,int n){
for (int i = 0; i< n;++i){
for (int j = 0; j< n; ++j){
C[i][j] =0;
for (int k = 0; k< n; ++k) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
void matmul_optimized(float** A,float** B,float** C,int n){
//疑似还要对B矩阵转置
for (int i = 0; i< n;++i){
for (int j = 0; j< n; ++j){
float32x4_t vecC=vdupq_n_f32(0.0);
for (int k = 0; k< n; k+=4) {
float32x4_t vecA = vld1q_f32(&A[i][k]);
float32x4_t vecB = vld1q_f32(&B[k][j]);
vecC = vmlaq_f32(vecC, vecA, vecB);
}
C[i][j] = vgetq_lane_f32(vecC, 0) + vgetq_lane_f32(vecC, 1) + vgetq_lane_f32(vecC, 2) + vgetq_lane_f32(vecC, 3);
}
}
}
void test_sparse_matmul(spare_matmul_func func,const char * attributive) {
float A_values[] = {1, 2, 3, 4, 5};
int A_rowIndex[] = {0, 0,1, 2, 2};
int A_colIndex[] = {0, 2, 1, 0, 2};
int A_nonZeroCount = 5;
//矩阵B的COO格式
float B_values[] = {6, 8, 7, 9};
int B_rowIndex[] = {0,2, 1, 2};
int B_colIndex[] = {0, 0, 1, 2};
int B_nonZeroCount = 4;
//结果矩阵C的coo格式
float C_values[MAX];
int C_rowIndex[MAX];
int C_colIndex[MAX];
int C_nonZeroCount = 0;
clock_t start = clock();
func(A_values,A_rowIndex,A_colIndex,A_nonZeroCount,B_values,B_rowIndex,B_colIndex,B_nonZeroCount,C_values,C_rowIndex,C_colIndex,&C_nonZeroCount);
clock_t end = clock();
printf("%s稀疏向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
}
void sparse_matmul_coo( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
int currentIndex = 0;
//遍历A的非零元素
for (int i = 0; i<A_nonZeroCount; i++) {
int colA = A_colIndex[i];int rowA = A_rowIndex[i];
float valueA = A_values[i];
//遍历B的非零元素
for (int j=0;j<B_nonZeroCount;j++) {
int rowB = B_rowIndex[j];
int colB = B_colIndex[j];
float valueB = B_values[j];
//如果A的列和B的行匹配则计算乘积并存储结果
if (colA == rowB) {
float product = valueA * valueB;
//检查是否已有此rowAcolB
int found = 0;
for (int k = 0;k< currentIndex; k++) {
if (C_rowIndex[k] == rowA && C_colIndex[k] == colB){
C_values[k] += product;
found = 1;
break;
}
}
if (!found){
//添加新的非零元素
C_values[currentIndex] = product;
C_rowIndex[currentIndex] = rowA;
C_colIndex[currentIndex] = colB;
currentIndex++;
}
//更新非零元素数量
}
}
}
*C_nonZeroCount =currentIndex;
}
void sparse_matmul_coo_optimized( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
const int n=4;
float **A = malloc(n * sizeof(float *));
float **B = malloc(n * sizeof(float *));
float **C = malloc(n * sizeof(float *));
for (int i = 0; i< n; ++i) {
A[i] = malloc(n * sizeof(float *));
B[i] = malloc(n * sizeof(float *));
C[i] = malloc(n * sizeof(float *));
}
for (int i = 0; i < A_nonZeroCount; i++) {
int row = A_rowIndex[i];
int col = A_colIndex[i];
A[row][col] = A_values[i];
}
for (int i = 0; i < B_nonZeroCount; i++) {
int row = B_rowIndex[i];
int col = B_colIndex[i];
B[row][col] = B_values[i];
}
matmul_optimized(A,B,C,n);
for (int i=0;i<n;i++) {
for (int j=0;j<n;j++) {
if (C[i][j]!=0) {
*C_nonZeroCount++;
*C_values=C[i][j];C_values++;
*C_colIndex=i;C_colIndex++;
*C_rowIndex=i;C_rowIndex++;
}
}
}
for (int i = 0; i< n; ++i) {
free(A[i]);
free(B[i]);
free(C[i]);
}
free(A);
free(B);
free(C);
}
int main(){
test_vector_add(vector_add,"正常");
test_vector_add(vector_add_optimized,"优化");
test_matmul(matmul,"正常");
test_matmul(matmul_optimized,"优化");
test_sparse_matmul(sparse_matmul_coo,"正常");
test_sparse_matmul(sparse_matmul_coo_optimized,"优化");
return 0;
}