You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
pdc/算子优化系统4.cpp

95 lines
3.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <stdio.h>
#include <stdlib.h>
#include <ctime>
// 包含NEON头文件启用NEON指令
#include <arm_neon.h>
// 定义矩阵大小
#define SIZE 1024
// Baseline (scalar) matrix multiplication: C = A * B.
//
// A, B and C are n x n matrices stored as arrays of row pointers. Every
// element of C is overwritten with the dot product of the matching row of A
// and column of B, accumulated in single precision in increasing k order.
void matmul(float** A, float** B, float** C, int n) {
    for (int row = 0; row < n; ++row) {
        const float* a_row = A[row];
        float* c_row = C[row];
        for (int col = 0; col < n; ++col) {
            float acc = 0;
            int idx = 0;
            while (idx < n) {
                acc += a_row[idx] * B[idx][col];
                ++idx;
            }
            c_row[col] = acc;
        }
    }
}
// NEON-accelerated matrix multiplication: C = A * B.
//
// A, B and C are n x n matrices stored as arrays of row pointers; every
// element of C is overwritten.
//
// Vectorization runs across four adjacent output columns. B[k][j..j+3] is
// contiguous in memory, so a single vector load fetches the B terms of four
// different dot products, and A[i][k] is applied as the scalar multiplier.
// Pairing a load from &B[k][j] with A[i][k..k+3] instead would not walk down
// a column of B (it yields the wrong product) and would read past the end of
// row B[k] near the right edge.
//
// Columns left over when n is not a multiple of 4, and all columns when NEON
// is not available at compile time, are handled by the scalar loop. Each
// output element is accumulated in increasing k order in both paths.
void matmul_optimized(float** A, float** B, float** C, int n) {
    for (int i = 0; i < n; i++) {
        int j = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        // Four output columns per iteration: lane l accumulates C[i][j + l].
        for (; j <= n - 4; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);
            for (int k = 0; k < n; k++) {
                // Contiguous load of B[k][j..j+3].
                float32x4_t b_vec = vld1q_f32(&B[k][j]);
                // acc += b_vec * A[i][k]
                acc = vmlaq_n_f32(acc, b_vec, A[i][k]);
            }
            vst1q_f32(&C[i][j], acc);
        }
#endif
        // Scalar path: remaining columns (or all of them without NEON).
        for (; j < n; j++) {
            float sum = 0.0f;
            for (int k = 0; k < n; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}
// Allocates an n x n float matrix as an array of n row pointers, each row a
// separate heap block. Returns NULL if any allocation fails; rows that were
// already allocated are released first, so nothing leaks on failure.
static float** alloc_matrix(int n) {
    float** m = (float**)malloc((size_t)n * sizeof(float*));
    if (m == NULL) {
        return NULL;
    }
    for (int i = 0; i < n; i++) {
        m[i] = (float*)malloc((size_t)n * sizeof(float));
        if (m[i] == NULL) {
            while (i-- > 0) {
                free(m[i]);
            }
            free(m);
            return NULL;
        }
    }
    return m;
}

// Releases a matrix obtained from alloc_matrix(); a NULL matrix is ignored.
static void free_matrix(float** m, int n) {
    if (m == NULL) {
        return;
    }
    for (int i = 0; i < n; i++) {
        free(m[i]);
    }
    free(m);
}

// Benchmark driver: fills two SIZE x SIZE matrices with pseudo-random values,
// times the baseline and the NEON matrix multiplications with clock(), and
// prints both elapsed times. Returns EXIT_FAILURE if the matrices cannot be
// allocated, 0 otherwise.
int main() {
    int status = EXIT_FAILURE;

    // Two inputs (A, B) and one result matrix per implementation.
    float** A = alloc_matrix(SIZE);
    float** B = alloc_matrix(SIZE);
    float** C = alloc_matrix(SIZE);
    float** C_optimized = alloc_matrix(SIZE);

    if (A == NULL || B == NULL || C == NULL || C_optimized == NULL) {
        fprintf(stderr, "matrix allocation failed\n");
    } else {
        // Fill A and B with integer-valued floats in [0, 99].
        for (int i = 0; i < SIZE; i++) {
            for (int j = 0; j < SIZE; j++) {
                A[i][j] = (float)(rand() % 100);
                B[i][j] = (float)(rand() % 100);
            }
        }

        // Time the baseline implementation.
        clock_t start_time_original = clock();
        matmul(A, B, C, SIZE);
        clock_t end_time_original = clock();
        double elapsed_time_original =
            (double)(end_time_original - start_time_original) / CLOCKS_PER_SEC;

        // Time the NEON implementation on the same inputs.
        clock_t start_time_optimized = clock();
        matmul_optimized(A, B, C_optimized, SIZE);
        clock_t end_time_optimized = clock();
        double elapsed_time_optimized =
            (double)(end_time_optimized - start_time_optimized) / CLOCKS_PER_SEC;

        printf("original time: %lf s\n", elapsed_time_original);
        printf("NEON optimized time: %lf s\n", elapsed_time_optimized);

        status = 0;
    }

    // Single cleanup path; free_matrix() tolerates matrices that failed to allocate.
    free_matrix(A, SIZE);
    free_matrix(B, SIZE);
    free_matrix(C, SIZE);
    free_matrix(C_optimized, SIZE);
    return status;
}