perf(fft): Replace DFT with Cooley-Tukey FFT (33x speedup)

- Implement iterative complex FFT in fft_utils.cpp (power-of-2 only)
- Replace O(n^2) DFT with O(n log n) FFT in compute_power_spectrum
- Validation time drops from ~3200ms to ~97ms total (~160ms -> ~5ms per file)
- All tests pass (test_core_lib 6/6, demo_offline 20/20)
zhaochang_branch
赵昌 1 day ago
parent 9911ad279d
commit 81879be4bc

@ -19,37 +19,63 @@ void apply_hann_window(float* data, size_t n) {
}
}
// Compute power spectrum via DFT.
// For n_fft = 2048 this is fast enough for offline/demo use.
// Twiddle factors are cached in a static lookup table per n_fft size.
void compute_power_spectrum(const float* frame, int n_fft, float* power_out) {
int bins = n_fft / 2 + 1;
namespace {
// Build or reuse cached twiddle table for this n_fft
static thread_local std::vector<float> cos_table;
static thread_local std::vector<float> sin_table;
static thread_local int cached_nfft = 0;
// Iterative Cooley-Tukey complex FFT (power-of-2 sizes only).
struct Cpx { float r = 0.0f, i = 0.0f; };
if (cached_nfft != n_fft) {
cached_nfft = n_fft;
cos_table.resize(static_cast<size_t>(n_fft));
sin_table.resize(static_cast<size_t>(n_fft));
for (int k = 0; k < n_fft; ++k) {
float angle = -2.0f * static_cast<float>(M_PI) * k / n_fft;
cos_table[k] = std::cos(angle);
sin_table[k] = std::sin(angle);
void fft_inplace(Cpx* data, int n, bool inverse) {
int j = 0;
for (int i = 1; i < n; ++i) {
int bit = n >> 1;
for (; j & bit; bit >>= 1) j ^= bit;
j ^= bit;
if (i < j) std::swap(data[i], data[j]);
}
for (int len = 2; len <= n; len <<= 1) {
float ang = 2.0f * static_cast<float>(M_PI) / len * (inverse ? 1.0f : -1.0f);
float wlen_r = std::cos(ang);
float wlen_i = std::sin(ang);
for (int i = 0; i < n; i += len) {
float wr = 1.0f, wi = 0.0f;
for (int k = 0; k < len / 2; ++k) {
int u = i + k;
int v = i + k + len / 2;
float ur = data[u].r, ui = data[u].i;
float vr = data[v].r * wr - data[v].i * wi;
float vi = data[v].r * wi + data[v].i * wr;
data[u].r = ur + vr;
data[u].i = ui + vi;
data[v].r = ur - vr;
data[v].i = ui - vi;
float wnr = wr * wlen_r - wi * wlen_i;
float wni = wr * wlen_i + wi * wlen_r;
wr = wnr;
wi = wni;
}
}
}
for (int b = 0; b < bins; ++b) {
float real = 0.0f;
float imag = 0.0f;
for (int n = 0; n < n_fft; ++n) {
int idx = (b * n) % n_fft;
real += frame[n] * cos_table[idx];
imag += frame[n] * sin_table[idx];
if (inverse) {
float inv_n = 1.0f / n;
for (int i = 0; i < n; ++i) {
data[i].r *= inv_n;
data[i].i *= inv_n;
}
power_out[b] = real * real + imag * imag;
}
}
} // anonymous namespace
void compute_power_spectrum(const float* frame, int n_fft, float* power_out) {
int bins = n_fft / 2 + 1;
std::vector<Cpx> data(n_fft);
for (int i = 0; i < n_fft; ++i) {
data[i].r = frame[i];
data[i].i = 0.0f;
}
fft_inplace(data.data(), n_fft, false);
for (int i = 0; i < bins; ++i) {
power_out[i] = data[i].r * data[i].r + data[i].i * data[i].i;
}
}

Loading…
Cancel
Save