ADD file via upload

main
paz4r2p6k 8 months ago
parent 339ac48183
commit 666a5397c4

@ -0,0 +1,171 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <arm_neon.h>
// Compile-time row and column counts used for every dense matrix in this file.
#define ROWS 10
#define COLS 10
// Sparse matrix stored as parallel arrays (one slot per stored entry):
// entry k has value values[k] and lives at row rowIndex[k], column colIndex[k].
typedef struct SparseMatrix {
    float *values;      // stored entry values, nonZeroCount elements
    int *rowIndex;      // row position of each stored entry
    int *colIndex;      // column position of each stored entry
    int nonZeroCount;   // number of entries held in the three arrays
} SparseMatrix;
// Expands a coordinate-format sparse matrix into a dense ROWS x COLS matrix.
//
// sparse:      source matrix; entry k is values[k] at (rowIndex[k], colIndex[k]).
//              May be NULL, in which case the output is simply all zeros.
// denseMatrix: destination; every element is overwritten.
//
// Entries whose coordinates fall outside the dense matrix are skipped and
// reported on stderr instead of being written out of bounds. If the same
// coordinate appears more than once, the last occurrence wins.
void sparseToDense(SparseMatrix *sparse, float denseMatrix[ROWS][COLS]) {
    // Start from an all-zero matrix so that unspecified positions read as 0.
    for (int i = 0; i < ROWS; i++) {
        for (int j = 0; j < COLS; j++) {
            denseMatrix[i][j] = 0.0f;
        }
    }

    // Nothing to scatter if there is no source or its arrays are missing.
    if (sparse == NULL || sparse->values == NULL ||
        sparse->rowIndex == NULL || sparse->colIndex == NULL) {
        return;
    }

    for (int k = 0; k < sparse->nonZeroCount; k++) {
        int row = sparse->rowIndex[k];
        int col = sparse->colIndex[k];

        // The indices come from caller-provided data; using them unchecked as
        // array subscripts would be undefined behaviour for bad input.
        if (row < 0 || row >= ROWS || col < 0 || col >= COLS) {
            fprintf(stderr,
                    "sparseToDense: entry %d at (%d, %d) is outside the %dx%d matrix; skipped\n",
                    k, row, col, ROWS, COLS);
            continue;
        }

        denseMatrix[row][col] = sparse->values[k];
    }
}
// The product below uses COLS as the inner dimension for both A's columns and
// B's rows, which is only valid for square matrices.
#if ROWS != COLS
#error "matmul_optimized requires ROWS == COLS"
#endif

// NEON-accelerated dense matrix product: C = A * B (single precision).
//
// A, B: input matrices. C: output matrix, fully overwritten.
// C must not overlap A or B, because C is written while A and B are still
// being read.
//
// Strategy: for each output row i, four adjacent output columns j..j+3 are
// accumulated at once. A[i][k] is broadcast to all four lanes and multiplied
// by B[k][j..j+3], which is contiguous in memory, so no strided gather of a
// column of B is needed. Output columns left over when COLS is not a multiple
// of 4 are handled by a scalar loop, so no vector load or store ever reaches
// past the end of a row.
void matmul_optimized(float A[ROWS][COLS], float B[ROWS][COLS], float C[ROWS][COLS]) {
    for (int i = 0; i < ROWS; i++) {
        int j = 0;

        // Vector part: blocks of four output columns that fit entirely in the row.
        for (; j + 4 <= COLS; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);
            for (int k = 0; k < COLS; k++) {
                // All four lanes hold A[i][k].
                float32x4_t a = vdupq_n_f32(A[i][k]);
                // B[k][j], B[k][j+1], B[k][j+2], B[k][j+3].
                float32x4_t b = vld1q_f32(&B[k][j]);
                // acc[lane] += A[i][k] * B[k][j + lane]
                acc = vmlaq_f32(acc, a, b);
            }
            vst1q_f32(&C[i][j], acc);
        }

        // Scalar tail: remaining output columns (COLS % 4 of them).
        for (; j < COLS; j++) {
            float sum = 0.0f;
            for (int k = 0; k < COLS; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}
// Writes the matrix to stdout, one row per line; every element is printed
// with "%f" and followed by a single space.
void printMatrix(float matrix[ROWS][COLS]) {
    int r = 0;
    while (r < ROWS) {
        // Walk the current row with a pointer instead of a column index.
        const float *cell = matrix[r];
        const float *rowEnd = cell + COLS;
        while (cell != rowEnd) {
            printf("%f ", *cell);
            cell++;
        }
        printf("\n");
        r++;
    }
}
int main() {
// 初始化稀疏矩阵A
SparseMatrix sparseA;
sparseA.nonZeroCount = 5;
sparseA.values = (float *)malloc(sparseA.nonZeroCount * sizeof(float));
sparseA.rowIndex = (int *)malloc(sparseA.nonZeroCount * sizeof(int));
sparseA.colIndex = (int *)malloc(sparseA.nonZeroCount * sizeof(int));
if (sparseA.values == NULL || sparseA.rowIndex == NULL || sparseA.colIndex == NULL) {
free(sparseA.values);
free(sparseA.rowIndex);
free(sparseA.colIndex);
fprintf(stderr, "Memory allocation failed for sparse matrix A!\n");
return 1;
}
sparseA.values[0] = 1;
sparseA.values[1] = 2;
sparseA.values[2] = 3;
sparseA.values[3] = 4;
sparseA.values[4] = 5;
sparseA.rowIndex[0] = 0;
sparseA.rowIndex[1] = 1;
sparseA.rowIndex[2] = 2;
sparseA.rowIndex[3] = 1;
sparseA.rowIndex[4] = 2;
sparseA.colIndex[0] = 0;
sparseA.colIndex[1] = 1;
sparseA.colIndex[2] = 2;
sparseA.colIndex[3] = 0;
sparseA.colIndex[4] = 1;
// 初始化稀疏矩阵B
SparseMatrix sparseB;
sparseB.nonZeroCount = 4;
sparseB.values = (float *)malloc(sparseB.nonZeroCount * sizeof(float));
sparseB.rowIndex = (int *)malloc(sparseB.nonZeroCount * sizeof(int));
sparseB.colIndex = (int *)malloc(sparseB.nonZeroCount * sizeof(int));
if (sparseB.values == NULL || sparseB.rowIndex == NULL || sparseB.colIndex == NULL) {
free(sparseB.values);
free(sparseB.rowIndex);
free(sparseB.colIndex);
fprintf(stderr, "Memory allocation failed for sparse matrix B!\n");
return 1;
}
sparseB.values[0] = 2;
sparseB.values[1] = 3;
sparseB.values[2] = 4;
sparseB.values[3] = 5;
sparseB.rowIndex[0] = 0;
sparseB.rowIndex[1] = 1;
sparseB.rowIndex[2] = 1;
sparseB.rowIndex[3] = 2;
sparseB.colIndex[0] = 0;
sparseB.colIndex[1] = 1;
sparseB.colIndex[2] = 0;
sparseB.colIndex[3] = 1;
// 声明并初始化常规矩阵
float denseA[ROWS][COLS];
float denseB[ROWS][COLS];
float denseC[ROWS][COLS];
// 将稀疏矩阵A转换为常规矩阵denseA
sparseToDense(&sparseA, denseA);
// 将稀疏矩阵B转换为常规矩阵denseB
sparseToDense(&sparseB, denseB);
// 打印转换后的常规矩阵A
printf("matrixA:\n");
printMatrix(denseA);
// 打印转换后的常规矩阵B
printf("matrixB:\n");
printMatrix(denseB);
// 记录NEON优化的矩阵乘法运行时间
clock_t startMul, endMul;
startMul = clock();
matmul_optimized(denseA, denseB, denseC);
endMul = clock();
double time_taken_Mul = ((double)(endMul - startMul)) / CLOCKS_PER_SEC;
printf("NEON time: %lfs\n", time_taken_Mul);
// 释放稀疏矩阵A的内存
free(sparseA.values);
free(sparseA.rowIndex);
free(sparseA.colIndex);
// 释放稀疏矩阵B的内存
free(sparseB.values);
free(sparseB.rowIndex);
free(sparseB.colIndex);
return 0;
}
Loading…
Cancel
Save