ADD file via upload

main
pu7icvtwj 8 months ago
parent a62c3001dc
commit a8c42927e8

@ -0,0 +1,130 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif
// Scalar reference vector addition.
// Writes c[i] = a[i] + b[i] for every i in [0, n); does nothing when n <= 0.
void vector_add_base(float *a, float *b, float *c, int n) {
    int idx = 0;
    while (idx < n) {
        c[idx] = a[idx] + b[idx];
        ++idx;
    }
}
// NEON-accelerated vector addition.
// Writes c[i] = a[i] + b[i] for every i in [0, n); does nothing when n <= 0.
//   a, b : input vectors, read only
//   c    : output vector (may be the same array as a or b; each element is
//          read before the matching element is written)
//   n    : number of elements
// When the compiler does not advertise NEON support the function degrades to
// the plain scalar loop, so it still produces the same result.
void vector_add_neon(const float *a, const float *b, float *c, int n) {
    int i = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    // Process four floats per iteration. "n - i >= 4" cannot overflow because
    // i is never negative, unlike "i <= n - 4" for very small n.
    for (; n - i >= 4; i += 4) {
        float32x4_t va = vld1q_f32(a + i);
        float32x4_t vb = vld1q_f32(b + i);
        vst1q_f32(c + i, vaddq_f32(va, vb));
    }
#endif
    // Scalar tail: the last n % 4 elements (or everything without NEON).
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}
// Scalar reference matrix multiplication: C = A * B.
// A, B and C are N x N matrices stored row-major in flat arrays of N * N
// floats. Each output element is the dot product of one row of A with one
// column of B, accumulated in increasing k order.
void matrix_multiply_base(float *A, float *B, float *C, int N) {
    for (int row = 0; row < N; row++) {
        float *a_row = A + row * N;
        float *c_row = C + row * N;
        for (int col = 0; col < N; col++) {
            float *b_col = B + col;  // walks down column `col` with stride N
            float acc = 0.0f;
            for (int t = 0; t < N; t++) {
                acc += a_row[t] * b_col[t * N];
            }
            c_row[col] = acc;
        }
    }
}
// NEON-accelerated matrix multiplication: C = A * B.
//   A, B : N x N input matrices, row-major, read only
//   C    : N x N output matrix, row-major; must not overlap A or B because
//          rows of C are written while A and B are still being read
//   N    : matrix dimension; nothing is written when N <= 0
//
// The kernel is vectorised across four adjacent OUTPUT COLUMNS. In row-major
// storage B[k][j..j+3] is contiguous, so a single vld1q_f32 feeds four dot
// products at once, each scaled by the scalar A[i][k]. Vectorising along k
// instead would need B[k..k+3][j], a strided column that a contiguous load
// cannot provide; pairing A[i][k..k+3] with B[k][j..j+3] does not compute a
// matrix product.
//
// Columns left over when N is not a multiple of four (or every column when
// the compiler does not advertise NEON) are handled by a scalar loop.
void matrix_multiply_neon(const float *A, const float *B, float *C, int N) {
    if (N <= 0) {
        return;
    }
    const size_t dim = (size_t)N;  // size_t indexing avoids int overflow in i * N

    for (size_t i = 0; i < dim; i++) {
        const float *a_row = A + i * dim;
        float *c_row = C + i * dim;
        size_t j = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        for (; dim - j >= 4; j += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);
            for (size_t k = 0; k < dim; k++) {
                // acc[l] += B[k][j + l] * A[i][k] for l = 0..3
                float32x4_t b_vec = vld1q_f32(B + k * dim + j);
                acc = vmlaq_n_f32(acc, b_vec, a_row[k]);
            }
            vst1q_f32(c_row + j, acc);
        }
#endif

        // Scalar tail for the remaining columns.
        for (; j < dim; j++) {
            float sum = 0.0f;
            for (size_t k = 0; k < dim; k++) {
                sum += a_row[k] * B[k * dim + j];
            }
            c_row[j] = sum;
        }
    }
}
// Fill an N x N matrix with pseudo-random values.
//   matrix : flat array of at least N * N floats
//   N      : matrix dimension
// Every element is set to (rand() % 100) / 100, i.e. one of 0.00, 0.01, ...,
// 0.99. The sequence depends on the current rand() state, so the caller
// decides whether to seed it. Nothing is written when matrix is NULL or
// N <= 0 (a negative N squared would otherwise yield a positive count).
void initialize_matrix(float *matrix, int N) {
    if (matrix == NULL || N <= 0) {
        return;
    }
    // Element count in size_t: N * N as int overflows for N > 46340.
    const size_t count = (size_t)N * (size_t)N;
    for (size_t i = 0; i < count; i++) {
        matrix[i] = (float)(rand() % 100) / 100.0f;
    }
}
// Current reading of the monotonic clock, in seconds.
// Returns tv_sec + tv_nsec * 1e-9 from clock_gettime(CLOCK_MONOTONIC); only
// differences between two readings are meaningful. Returns -1.0 if the clock
// cannot be read, so a failed call never exposes an uninitialised timespec.
double get_time(void) {
    struct timespec ts = {0};
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        return -1.0;
    }
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}
// Benchmark driver.
// Allocates two input vectors and two input matrices of dimension N, fills
// them with pseudo-random values, then times the scalar and NEON versions of
// vector addition and matrix multiplication once each and prints the elapsed
// wall-clock time. Both versions of each operation write into the same output
// buffer (c / C); the results are timed, not compared.
// Returns 0 on success, EXIT_FAILURE if any buffer cannot be allocated.
int main(void) {
    int N = 1024; // vector length and matrix dimension

    // Sizes are computed in size_t so N * N cannot overflow int.
    const size_t vec_bytes = (size_t)N * sizeof(float);
    const size_t mat_bytes = (size_t)N * (size_t)N * sizeof(float);

    // 16-byte alignment matches the 128-bit NEON register width.
    // aligned_alloc requires the size to be a multiple of the alignment,
    // which holds here because N is 1024.
    float *a = aligned_alloc(16, vec_bytes);
    float *b = aligned_alloc(16, vec_bytes);
    float *c = aligned_alloc(16, vec_bytes);
    float *A = aligned_alloc(16, mat_bytes);
    float *B = aligned_alloc(16, mat_bytes);
    float *C = aligned_alloc(16, mat_bytes);
    if (a == NULL || b == NULL || c == NULL ||
        A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "memory allocation failed\n");
        // free(NULL) is a no-op, so no per-pointer guards are needed.
        free(a);
        free(b);
        free(c);
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    // Initialise the input vectors and matrices.
    for (int i = 0; i < N; i++) {
        a[i] = (float)(rand() % 100) / 100.0f;
        b[i] = (float)(rand() % 100) / 100.0f;
    }
    initialize_matrix(A, N);
    initialize_matrix(B, N);

    // Scalar vector addition.
    double start_time = get_time();
    vector_add_base(a, b, c, N);
    double end_time = get_time();
    printf("基础向量加法时间: %.6f 秒\n", end_time - start_time);

    // NEON vector addition.
    start_time = get_time();
    vector_add_neon(a, b, c, N);
    end_time = get_time();
    printf("NEON优化向量加法时间: %.6f 秒\n", end_time - start_time);

    // Scalar matrix multiplication.
    start_time = get_time();
    matrix_multiply_base(A, B, C, N);
    end_time = get_time();
    printf("基础矩阵乘法时间: %.6f 秒\n", end_time - start_time);

    // NEON matrix multiplication.
    start_time = get_time();
    matrix_multiply_neon(A, B, C, N);
    end_time = get_time();
    printf("NEON优化矩阵乘法时间: %.6f 秒\n", end_time - start_time);

    // Release all buffers.
    free(a);
    free(b);
    free(c);
    free(A);
    free(B);
    free(C);
    return 0;
}
Loading…
Cancel
Save