ADD file via upload

main
pi7mcrg2k 1 week ago
parent 4b3629f775
commit fc89ff89cf

@ -0,0 +1,236 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* NEON intrinsics exist only on ARM targets that advertise them through the
 * ACLE feature macro; other hosts use the scalar fallbacks in the kernels. */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif
#define ROW 4
#define COL 4
#define MAX 16
/* Kernel signature for element-wise vector addition: reads A and B, writes C,
 * each holding `size` floats. */
typedef void (*vector_add_func)(float *A, float *B, float *C, int size);

/* Kernel signature for dense n x n matrix multiplication; every matrix is
 * passed as an array of n row pointers. */
typedef void (*matmul_func)(float **A, float **B, float **C, int n);

/* Kernel signature for COO sparse matrix multiplication: two input triplet
 * lists (values / row indices / column indices / entry count) followed by the
 * output triplet arrays and a pointer that receives the output entry count. */
typedef void (*spare_matmul_func)(float *A_values, int *A_rowIndex, int *A_colIndex, int A_nonZeroCount,
                                  float *B_values, int *B_rowIndex, int *B_colIndex, int B_nonZeroCount,
                                  float *C_values, int *C_rowIndex, int *C_colIndex, int *C_nonZeroCount);
/*
 * Times one call of a vector-add kernel on two 1024-element inputs filled
 * with rand() % 100 and prints the elapsed CPU time.
 *
 * func        - kernel under test.
 * attributive - label printed in front of the timing message.
 *
 * If any buffer cannot be allocated, the benchmark is skipped with a message
 * on stderr instead of dereferencing a null pointer.
 */
void test_vector_add(vector_add_func func,const char * attributive){
    const int size = 1024;
    float *A = malloc((size_t)size * sizeof *A);
    float *B = malloc((size_t)size * sizeof *B);
    float *C = malloc((size_t)size * sizeof *C);

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "test_vector_add: out of memory\n");
    } else {
        for (int i = 0; i < size; i++) {
            A[i] = (float)(rand() % 100);
            B[i] = (float)(rand() % 100);
        }
        clock_t start = clock();
        func(A, B, C, size);
        clock_t end = clock();
        printf("%s向量加法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
    }

    /* free(NULL) is a no-op, so one cleanup path serves success and failure. */
    free(A);
    free(B);
    free(C);
}
/*
 * Scalar reference implementation: C[i] = A[i] + B[i] for i in [0, size).
 * Processes one element per iteration and uses no SIMD.
 */
void vector_add(float* A, float* B, float* C, int size) {
    int idx = 0;
    while (idx < size) {
        C[idx] = A[idx] + B[idx];
        ++idx;
    }
}
/*
 * SIMD vector addition: C[i] = A[i] + B[i] for i in [0, size).
 *
 * Full groups of four floats go through NEON when the target advertises it;
 * the remaining 0-3 elements (or all of them on a target without NEON) are
 * handled by the scalar loop, so no access ever goes past index size - 1.
 */
void vector_add_optimized(float* A, float* B, float* C, int size) {
    int i = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* `size - i >= 4` rather than `i < size`: a 4-wide load/store is issued
     * only when four whole elements remain. */
    for (; size - i >= 4; i += 4) {
        /* Load four floats from each input into NEON registers. */
        float32x4_t vecA = vld1q_f32(&A[i]);
        float32x4_t vecB = vld1q_f32(&B[i]);
        /* Add lane-wise and store the four sums into C. */
        vst1q_f32(&C[i], vaddq_f32(vecA, vecB));
    }
#endif
    /* Scalar tail for whatever the vector loop did not cover. */
    for (; i < size; ++i) {
        C[i] = A[i] + B[i];
    }
}
/*
 * Times one call of a dense matmul kernel on 1024 x 1024 matrices and prints
 * the elapsed CPU time.
 *
 * func        - kernel under test.
 * attributive - label printed in front of the timing message.
 *
 * A and B are both filled with rand() % 100 so the kernel never reads
 * uninitialised memory; C is left for the kernel to write. If any allocation
 * fails, the benchmark is skipped with a message on stderr.
 */
void test_matmul(matmul_func func,const char * attributive) {
    const int n = 1024;
    float **A = malloc((size_t)n * sizeof *A);
    float **B = malloc((size_t)n * sizeof *B);
    float **C = malloc((size_t)n * sizeof *C);
    int ok = (A != NULL && B != NULL && C != NULL);
    int rows = 0; /* rows whose three pointers have been assigned so far */

    /* Each row holds n floats, so it is sized by the element type rather
     * than by the pointer type. */
    for (; ok && rows < n; ++rows) {
        A[rows] = malloc((size_t)n * sizeof *A[rows]);
        B[rows] = malloc((size_t)n * sizeof *B[rows]);
        C[rows] = malloc((size_t)n * sizeof *C[rows]);
        ok = (A[rows] != NULL && B[rows] != NULL && C[rows] != NULL);
    }

    if (ok) {
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                A[i][j] = (float)(rand() % 100);
                B[i][j] = (float)(rand() % 100);
            }
        }
        clock_t start = clock();
        func(A, B, C, n);
        clock_t end = clock();
        printf("%s稠密向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
    } else {
        fprintf(stderr, "test_matmul: out of memory\n");
    }

    /* Only rows [0, rows) were assigned; a failed entry is NULL and
     * free(NULL) is a no-op. */
    for (int i = 0; i < rows; ++i) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
    }
    free(A);
    free(B);
    free(C);
}
/*
 * Scalar reference implementation of C = A * B for n x n matrices stored as
 * arrays of row pointers. Each output element is cleared and then accumulated
 * in place over k.
 */
void matmul(float** A,float** B,float** C,int n){
    int row = 0;
    while (row < n) {
        float *lhs_row = A[row];
        float *out_row = C[row];
        int col = 0;
        while (col < n) {
            int inner = 0;
            out_row[col] = 0;
            while (inner < n) {
                out_row[col] += lhs_row[inner] * B[inner][col];
                ++inner;
            }
            ++col;
        }
        ++row;
    }
}
/*
 * SIMD implementation of C = A * B for n x n matrices stored as arrays of
 * row pointers.
 *
 * The vector loop produces four adjacent output columns at a time: for each
 * k it broadcasts A[i][k] to all lanes and multiplies it by the contiguous
 * slice B[k][j..j+3], so lane l accumulates sum_k A[i][k] * B[k][j+l].
 * Vectorising across output columns keeps every load contiguous and removes
 * the need to transpose B.
 *
 * Columns left over when n is not a multiple of four (or all columns on a
 * target without NEON) are computed by the scalar loop, so no access goes
 * past the end of a row.
 */
void matmul_optimized(float** A,float** B,float** C,int n){
    for (int i = 0; i < n; ++i) {
        const float *a_row = A[i];
        float *c_row = C[i];
        int j = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        for (; n - j >= 4; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);
            for (int k = 0; k < n; ++k) {
                float32x4_t a_bcast = vdupq_n_f32(a_row[k]);
                float32x4_t b_slice = vld1q_f32(&B[k][j]);
                acc = vmlaq_f32(acc, a_bcast, b_slice);
            }
            vst1q_f32(&c_row[j], acc);
        }
#endif
        /* Scalar tail: one output column per iteration. */
        for (; j < n; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < n; ++k) {
                sum += a_row[k] * B[k][j];
            }
            c_row[j] = sum;
        }
    }
}
/*
 * Times one call of a COO sparse matmul kernel on a fixed pair of small
 * matrices (row and column indices span 0..2) and prints the elapsed CPU time.
 *
 * func        - kernel under test.
 * attributive - label printed in front of the timing message.
 */
void test_sparse_matmul(spare_matmul_func func,const char * attributive) {
    /* Left operand in COO form: five non-zero entries. */
    float lhs_values[] = {1, 2, 3, 4, 5};
    int lhs_rows[] = {0, 0, 1, 2, 2};
    int lhs_cols[] = {0, 2, 1, 0, 2};
    int lhs_count = 5;

    /* Right operand in COO form: four non-zero entries. */
    float rhs_values[] = {6, 8, 7, 9};
    int rhs_rows[] = {0, 2, 1, 2};
    int rhs_cols[] = {0, 0, 1, 2};
    int rhs_count = 4;

    /* Output triplets in COO form, with room for MAX entries. */
    float out_values[MAX];
    int out_rows[MAX];
    int out_cols[MAX];
    int out_count = 0;

    clock_t start = clock();
    func(lhs_values, lhs_rows, lhs_cols, lhs_count,
         rhs_values, rhs_rows, rhs_cols, rhs_count,
         out_values, out_rows, out_cols, &out_count);
    clock_t end = clock();

    printf("%s稀疏向量乘法耗时%lf秒\n",attributive,(double)(end-start)/CLOCKS_PER_SEC);
}
/*
 * Reference COO x COO sparse matrix product.
 *
 * For every pair of entries where A's column equals B's row, the product is
 * added to the output entry at (A's row, B's column). Output entries are
 * appended in the order their coordinates are first produced, and the number
 * of entries written is stored in *C_nonZeroCount.
 *
 * The output arrays have no capacity parameter; the caller must size them for
 * every distinct (row, column) pair that can be produced.
 */
void sparse_matmul_coo( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
    int used = 0; /* number of output entries written so far */

    for (int a = 0; a < A_nonZeroCount; a++) {
        const int out_row = A_rowIndex[a];
        const int join = A_colIndex[a];
        const float lhs = A_values[a];

        for (int b = 0; b < B_nonZeroCount; b++) {
            /* Only pairs whose inner indices agree contribute to the product. */
            if (B_rowIndex[b] != join) {
                continue;
            }
            const int out_col = B_colIndex[b];
            const float term = lhs * B_values[b];

            /* Linear search for an existing entry at (out_row, out_col). */
            int slot = 0;
            while (slot < used &&
                   !(C_rowIndex[slot] == out_row && C_colIndex[slot] == out_col)) {
                slot++;
            }

            if (slot < used) {
                C_values[slot] += term;
            } else {
                /* First contribution to this coordinate: append a new entry. */
                C_values[used] = term;
                C_rowIndex[used] = out_row;
                C_colIndex[used] = out_col;
                used++;
            }
        }
    }

    *C_nonZeroCount = used;
}
/* Allocates a dim x dim matrix of zeros as an array of row pointers.
 * Returns NULL, with nothing left allocated, if any allocation fails. */
static float **sparse_dense_alloc(int dim) {
    float **rows = malloc((size_t)dim * sizeof *rows);
    if (rows == NULL) {
        return NULL;
    }
    for (int r = 0; r < dim; ++r) {
        rows[r] = calloc((size_t)dim, sizeof *rows[r]);
        if (rows[r] == NULL) {
            while (r-- > 0) {
                free(rows[r]);
            }
            free(rows);
            return NULL;
        }
    }
    return rows;
}

/* Releases a matrix obtained from sparse_dense_alloc; accepts NULL. */
static void sparse_dense_free(float **rows, int dim) {
    if (rows == NULL) {
        return;
    }
    for (int r = 0; r < dim; ++r) {
        free(rows[r]);
    }
    free(rows);
}

/*
 * COO x COO sparse matrix product computed by expanding both operands into
 * dense square matrices, multiplying them with matmul_optimized, and
 * compressing the non-zero results back into COO form in row-major order.
 *
 * Inputs : A_* / B_* triplet arrays with A_nonZeroCount / B_nonZeroCount
 *          entries; all indices must be non-negative.
 * Outputs: C_values / C_rowIndex / C_colIndex receive one triplet per
 *          non-zero result element and *C_nonZeroCount receives their number.
 *          The caller must provide room for every non-zero of the product.
 *
 * If either input is empty or the scratch matrices cannot be allocated,
 * *C_nonZeroCount is set to 0 and nothing else is written.
 */
void sparse_matmul_coo_optimized( float* A_values, int* A_rowIndex, int* A_colIndex, int A_nonZeroCount,
float* B_values,int* B_rowIndex,int* B_colIndex, int B_nonZeroCount,
float* C_values, int* C_rowIndex, int* C_colIndex, int* C_nonZeroCount) {
    *C_nonZeroCount = 0;
    if (A_nonZeroCount <= 0 || B_nonZeroCount <= 0) {
        return;
    }

    /* Size the dense scratch matrices from the largest index that actually
     * occurs, so no entry can land outside the allocation. */
    int dim = 0;
    for (int i = 0; i < A_nonZeroCount; i++) {
        if (A_rowIndex[i] >= dim) dim = A_rowIndex[i] + 1;
        if (A_colIndex[i] >= dim) dim = A_colIndex[i] + 1;
    }
    for (int i = 0; i < B_nonZeroCount; i++) {
        if (B_rowIndex[i] >= dim) dim = B_rowIndex[i] + 1;
        if (B_colIndex[i] >= dim) dim = B_colIndex[i] + 1;
    }
    /* Round up to a multiple of four so the dense kernel always sees whole
     * 4-wide groups; the zero padding contributes nothing to the product. */
    dim = (dim + 3) / 4 * 4;

    float **A = sparse_dense_alloc(dim);
    float **B = sparse_dense_alloc(dim);
    float **C = sparse_dense_alloc(dim);

    if (A != NULL && B != NULL && C != NULL) {
        /* Accumulate rather than assign so repeated coordinates add up, which
         * matches how the triplet-by-triplet product treats them. */
        for (int i = 0; i < A_nonZeroCount; i++) {
            A[A_rowIndex[i]][A_colIndex[i]] += A_values[i];
        }
        for (int i = 0; i < B_nonZeroCount; i++) {
            B[B_rowIndex[i]][B_colIndex[i]] += B_values[i];
        }

        matmul_optimized(A, B, C, dim);

        int emitted = 0;
        for (int i = 0; i < dim; i++) {
            for (int j = 0; j < dim; j++) {
                if (C[i][j] != 0) {
                    C_values[emitted] = C[i][j];
                    C_rowIndex[emitted] = i;
                    C_colIndex[emitted] = j;
                    emitted++;
                }
            }
        }
        *C_nonZeroCount = emitted;
    }

    sparse_dense_free(A, dim);
    sparse_dense_free(B, dim);
    sparse_dense_free(C, dim);
}
/*
 * Runs each benchmark twice, first with the scalar kernel and then with the
 * optimized one, in the order: vector add, dense matmul, sparse matmul.
 */
int main(void)
{
    /* Element-wise vector addition. */
    test_vector_add(vector_add, "正常");
    test_vector_add(vector_add_optimized, "优化");

    /* Dense matrix multiplication. */
    test_matmul(matmul, "正常");
    test_matmul(matmul_optimized, "优化");

    /* COO sparse matrix multiplication. */
    test_sparse_matmul(sparse_matmul_coo, "正常");
    test_sparse_matmul(sparse_matmul_coo_optimized, "优化");

    return EXIT_SUCCESS;
}
Loading…
Cancel
Save