|
|
|
@ -0,0 +1,171 @@
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <time.h>
|
|
|
|
|
#include <arm_neon.h>
|
|
|
|
|
|
|
|
|
|
// 预定义矩阵的行数和列数
|
|
|
|
|
// Row count of the fixed-size dense matrices: the first dimension of every
// float[ROWS][COLS] array used in this file.
#define ROWS 10
|
|
|
|
|
// Column count of the fixed-size dense matrices: the second dimension of every
// float[ROWS][COLS] array used in this file.
#define COLS 10
|
|
|
|
|
|
|
|
|
|
// Sparse matrix stored as three parallel arrays (coordinate list).
//
// Entry k of the matrix has value values[k] and sits at row rowIndex[k],
// column colIndex[k]. All three arrays hold nonZeroCount elements. The struct
// does not allocate or free them itself; whoever fills in the pointers is
// responsible for their lifetime.
typedef struct SparseMatrix {
    float *values;      // values[k]: value of the k-th stored entry
    int *rowIndex;      // rowIndex[k]: zero-based row of the k-th stored entry
    int *colIndex;      // colIndex[k]: zero-based column of the k-th stored entry
    int nonZeroCount;   // number of stored entries (length of each array above)
} SparseMatrix;
|
|
|
|
|
|
|
|
|
|
// 将稀疏矩阵转换为普通矩阵
|
|
|
|
|
void sparseToDense(SparseMatrix *sparse, float denseMatrix[ROWS][COLS]) {
|
|
|
|
|
for (int i = 0; i < ROWS; i++) {
|
|
|
|
|
for (int j = 0; j < COLS; j++) {
|
|
|
|
|
denseMatrix[i][j] = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int k = 0; k < sparse->nonZeroCount; k++) {
|
|
|
|
|
int row = sparse->rowIndex[k];
|
|
|
|
|
int col = sparse->colIndex[k];
|
|
|
|
|
denseMatrix[row][col] = sparse->values[k];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// NEON-accelerated dense matrix product: C = A * B.
//
// Parameters:
//   A, B - input matrices (read only).
//   C    - output matrix; every element is overwritten. C must not alias A or
//          B, because inputs are still being read while C is written.
//
// Each output element is C[i][j] = sum over k of A[i][k] * B[k][j]. Four
// adjacent output columns are computed at once: A[i][k] is broadcast to all
// lanes and multiplied by the four contiguous values B[k][j..j+3], so every
// vector load stays inside a single row of B. Columns left over when COLS is
// not a multiple of 4 are computed with plain scalar code.
//
// A matrix product of two ROWS x COLS operands is only defined when
// ROWS == COLS; the shared dimension is clamped to the smaller of the two so
// that no access can leave the arrays.
void matmul_optimized(float A[ROWS][COLS], float B[ROWS][COLS], float C[ROWS][COLS]) {
    // k indexes columns of A and rows of B.
    const int inner = (ROWS < COLS) ? ROWS : COLS;
    // Largest multiple of 4 that fits in a row; columns below it are vectorized.
    const int vecCols = COLS - (COLS % 4);

    for (int i = 0; i < ROWS; i++) {
        int j = 0;

        // Vector part: lanes 0..3 accumulate C[i][j] .. C[i][j + 3].
        for (; j < vecCols; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);

            for (int k = 0; k < inner; k++) {
                const float32x4_t aBroadcast = vdupq_n_f32(A[i][k]);
                // j + 3 < vecCols <= COLS, so this load never crosses the row end.
                const float32x4_t bRow = vld1q_f32(&B[k][j]);
                acc = vmlaq_f32(acc, aBroadcast, bRow);
            }

            // Lane numbers must be compile-time constants, hence four stores.
            C[i][j]     = vgetq_lane_f32(acc, 0);
            C[i][j + 1] = vgetq_lane_f32(acc, 1);
            C[i][j + 2] = vgetq_lane_f32(acc, 2);
            C[i][j + 3] = vgetq_lane_f32(acc, 3);
        }

        // Scalar tail for the remaining COLS % 4 columns.
        for (; j < COLS; j++) {
            float sum = 0.0f;
            for (int k = 0; k < inner; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}
|
|
|
|
|
|
|
|
|
|
// 打印矩阵
|
|
|
|
|
void printMatrix(float matrix[ROWS][COLS]) {
|
|
|
|
|
for (int i = 0; i < ROWS; i++) {
|
|
|
|
|
for (int j = 0; j < COLS; j++) {
|
|
|
|
|
printf("%f ", matrix[i][j]);
|
|
|
|
|
}
|
|
|
|
|
printf("\n");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int main() {
|
|
|
|
|
// 初始化稀疏矩阵A
|
|
|
|
|
SparseMatrix sparseA;
|
|
|
|
|
sparseA.nonZeroCount = 5;
|
|
|
|
|
sparseA.values = (float *)malloc(sparseA.nonZeroCount * sizeof(float));
|
|
|
|
|
sparseA.rowIndex = (int *)malloc(sparseA.nonZeroCount * sizeof(int));
|
|
|
|
|
sparseA.colIndex = (int *)malloc(sparseA.nonZeroCount * sizeof(int));
|
|
|
|
|
|
|
|
|
|
if (sparseA.values == NULL || sparseA.rowIndex == NULL || sparseA.colIndex == NULL) {
|
|
|
|
|
free(sparseA.values);
|
|
|
|
|
free(sparseA.rowIndex);
|
|
|
|
|
free(sparseA.colIndex);
|
|
|
|
|
fprintf(stderr, "Memory allocation failed for sparse matrix A!\n");
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sparseA.values[0] = 1;
|
|
|
|
|
sparseA.values[1] = 2;
|
|
|
|
|
sparseA.values[2] = 3;
|
|
|
|
|
sparseA.values[3] = 4;
|
|
|
|
|
sparseA.values[4] = 5;
|
|
|
|
|
|
|
|
|
|
sparseA.rowIndex[0] = 0;
|
|
|
|
|
sparseA.rowIndex[1] = 1;
|
|
|
|
|
sparseA.rowIndex[2] = 2;
|
|
|
|
|
sparseA.rowIndex[3] = 1;
|
|
|
|
|
sparseA.rowIndex[4] = 2;
|
|
|
|
|
|
|
|
|
|
sparseA.colIndex[0] = 0;
|
|
|
|
|
sparseA.colIndex[1] = 1;
|
|
|
|
|
sparseA.colIndex[2] = 2;
|
|
|
|
|
sparseA.colIndex[3] = 0;
|
|
|
|
|
sparseA.colIndex[4] = 1;
|
|
|
|
|
|
|
|
|
|
// 初始化稀疏矩阵B
|
|
|
|
|
SparseMatrix sparseB;
|
|
|
|
|
sparseB.nonZeroCount = 4;
|
|
|
|
|
sparseB.values = (float *)malloc(sparseB.nonZeroCount * sizeof(float));
|
|
|
|
|
sparseB.rowIndex = (int *)malloc(sparseB.nonZeroCount * sizeof(int));
|
|
|
|
|
sparseB.colIndex = (int *)malloc(sparseB.nonZeroCount * sizeof(int));
|
|
|
|
|
|
|
|
|
|
if (sparseB.values == NULL || sparseB.rowIndex == NULL || sparseB.colIndex == NULL) {
|
|
|
|
|
free(sparseB.values);
|
|
|
|
|
free(sparseB.rowIndex);
|
|
|
|
|
free(sparseB.colIndex);
|
|
|
|
|
fprintf(stderr, "Memory allocation failed for sparse matrix B!\n");
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sparseB.values[0] = 2;
|
|
|
|
|
sparseB.values[1] = 3;
|
|
|
|
|
sparseB.values[2] = 4;
|
|
|
|
|
sparseB.values[3] = 5;
|
|
|
|
|
|
|
|
|
|
sparseB.rowIndex[0] = 0;
|
|
|
|
|
sparseB.rowIndex[1] = 1;
|
|
|
|
|
sparseB.rowIndex[2] = 1;
|
|
|
|
|
sparseB.rowIndex[3] = 2;
|
|
|
|
|
|
|
|
|
|
sparseB.colIndex[0] = 0;
|
|
|
|
|
sparseB.colIndex[1] = 1;
|
|
|
|
|
sparseB.colIndex[2] = 0;
|
|
|
|
|
sparseB.colIndex[3] = 1;
|
|
|
|
|
|
|
|
|
|
// 声明并初始化常规矩阵
|
|
|
|
|
float denseA[ROWS][COLS];
|
|
|
|
|
float denseB[ROWS][COLS];
|
|
|
|
|
float denseC[ROWS][COLS];
|
|
|
|
|
|
|
|
|
|
// 将稀疏矩阵A转换为常规矩阵denseA
|
|
|
|
|
sparseToDense(&sparseA, denseA);
|
|
|
|
|
|
|
|
|
|
// 将稀疏矩阵B转换为常规矩阵denseB
|
|
|
|
|
sparseToDense(&sparseB, denseB);
|
|
|
|
|
|
|
|
|
|
// 打印转换后的常规矩阵A
|
|
|
|
|
printf("matrixA:\n");
|
|
|
|
|
printMatrix(denseA);
|
|
|
|
|
|
|
|
|
|
// 打印转换后的常规矩阵B
|
|
|
|
|
printf("matrixB:\n");
|
|
|
|
|
printMatrix(denseB);
|
|
|
|
|
|
|
|
|
|
// 记录NEON优化的矩阵乘法运行时间
|
|
|
|
|
clock_t startMul, endMul;
|
|
|
|
|
startMul = clock();
|
|
|
|
|
matmul_optimized(denseA, denseB, denseC);
|
|
|
|
|
endMul = clock();
|
|
|
|
|
double time_taken_Mul = ((double)(endMul - startMul)) / CLOCKS_PER_SEC;
|
|
|
|
|
printf("NEON time: %lfs\n", time_taken_Mul);
|
|
|
|
|
|
|
|
|
|
// 释放稀疏矩阵A的内存
|
|
|
|
|
free(sparseA.values);
|
|
|
|
|
free(sparseA.rowIndex);
|
|
|
|
|
free(sparseA.colIndex);
|
|
|
|
|
|
|
|
|
|
// 释放稀疏矩阵B的内存
|
|
|
|
|
free(sparseB.values);
|
|
|
|
|
free(sparseB.rowIndex);
|
|
|
|
|
free(sparseB.colIndex);
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|