You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

129 lines
3.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <arm_neon.h>
#define ROWS 1024
#define COLS 1024
void sparseToDense(float* values, int* rowIndex, int* colIndex, int nonZeroCount, float denseMatrix[ROWS][COLS]) {
for (int i = 0; i < ROWS; i++) {
for (int j = 0; j < COLS; j++) {
denseMatrix[i][j] = 0;
}
}
for (int k = 0; k < nonZeroCount; k++) {
int row = rowIndex[k];
int col = colIndex[k];
float value = values[k];
denseMatrix[row][col] = value;
}
}
/**
 * Single-precision dense matrix product C = A * B for n x n matrices.
 *
 * Each matrix is passed as an array of n row pointers, each row holding at
 * least n floats. C is fully overwritten in its first n rows and columns;
 * its rows must not overlap rows of A or B, because a row of C is cleared
 * and accumulated into while A and B are still being read.
 *
 * The loops run in i-k-j order: for a fixed output row i, every A[i][k] is
 * multiplied with row k of B and added to row i of C. Both B[k][j..j+3] and
 * C[i][j..j+3] are contiguous, which is what the 4-lane loads and stores
 * need. Loading &B[k][j] and multiplying it lane-wise with A[i][k..k+3]
 * would pair A[i][k+l] with B[k][j+l] instead of B[k+l][j].
 *
 * The NEON loop is compiled only when the compiler defines __ARM_NEON (or
 * the older __ARM_NEON__). Columns left over when n is not a multiple of 4,
 * or all columns when NEON is unavailable, are handled by the scalar loop.
 *
 * @param A  left operand, n row pointers
 * @param B  right operand, n row pointers
 * @param C  output, n row pointers
 * @param n  matrix dimension; nothing is written when n <= 0
 */
void matmul_optimized(float** A, float** B, float** C, int n) {
    for (int i = 0; i < n; i++) {
        const float *a_row = A[i];
        float *c_row = C[i];

        // Row i of C is an accumulator, so it has to start from zero.
        for (int j = 0; j < n; j++) {
            c_row[j] = 0.0f;
        }

        for (int k = 0; k < n; k++) {
            const float a_ik = a_row[k];
            const float *b_row = B[k];
            int j = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
            // c_row[j..j+3] += a_ik * b_row[j..j+3]
            for (; j <= n - 4; j += 4) {
                float32x4_t acc = vld1q_f32(&c_row[j]);
                acc = vmlaq_n_f32(acc, vld1q_f32(&b_row[j]), a_ik);
                vst1q_f32(&c_row[j], acc);
            }
#endif
            // Scalar tail (and the whole row when NEON is not compiled in).
            for (; j < n; j++) {
                c_row[j] += a_ik * b_row[j];
            }
        }
    }
}
int main() {
float A_values[] = {1, 2, 3};
int A_rowIndex[] = {0, 1, 2};
int A_colIndex[] = {0, 1, 2};
int A_nonZeroCount = 3;
float B_values[] = {4, 5, 6};
int B_rowIndex[] = {0, 1, 2};
int B_colIndex[] = {0, 1, 2};
int B_nonZeroCount = 3;
float denseMatrixA[ROWS][COLS];
float denseMatrixB[ROWS][COLS];
float resultMatrix[ROWS][COLS];
sparseToDense(A_values, A_rowIndex, A_colIndex, A_nonZeroCount, denseMatrixA);
printf("常规矩阵A\n");
for (int i = 0; i < ROWS; i++) {
for (int j = 0; j < COLS; j++) {
printf("%f ", denseMatrixA[i][j]);
}
printf("\n");
}
sparseToDense(B_values, B_rowIndex, B_colIndex, B_nonZeroCount, denseMatrixB);
printf("常规矩阵B\n");
for (int i = 0; i < ROWS; i++) {
for (int j = 0; j < COLS; j++) {
printf("%f ", denseMatrixB[i][j]);
}
printf("\n");
}
float* matrixAPtr[ROWS];
for (int i = 0; i < ROWS; i++) {
matrixAPtr[i] = denseMatrixA[i];
}
float* matrixBPtr[ROWS];
for (int i = 0; i < ROWS; i++) {
matrixBPtr[i] = denseMatrixB[i];
}
float* resultMatrixPtr[ROWS];
for (int i = 0; i < ROWS; i++) {
resultMatrixPtr[i] = resultMatrix[i];
}
clock_t start_time, end_time;
start_time = clock();
matmul_optimized((float**)matrixAPtr, (float**)matrixBPtr, (float**)resultMatrixPtr, ROWS);
end_time = clock();
double elapsed_time = ((double)(end_time - start_time)) / CLOCKS_PER_SEC;
printf("优化的稀疏矩阵乘法使用NEON的运行时间%f 秒\n", elapsed_time);
printf("结果矩阵:\n");
for (int i = 0; i < ROWS; i++) {
for (int j = 0; j < COLS; j++) {
printf("%f ", resultMatrix[i][j]);
}
printf("\n");
}
return 0;
}