|
|
|
|
@ -0,0 +1,180 @@
|
|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
"""
|
|
|
|
|
有标注学习音频特征提取:读取“瓷空1.wav”(标注为“空”),提取五维特征+标签,保存MAT/PKL(适配深度学习)
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
import numpy as np
|
|
|
|
|
import scipy.io.wavfile as wav
|
|
|
|
|
from scipy.io import savemat
|
|
|
|
|
from scipy.signal import hilbert
|
|
|
|
|
import librosa
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
import os
|
|
|
|
|
import pickle # 用于保存PKL文件
|
|
|
|
|
|
|
|
|
|
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei so CJK labels render correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly under SimHei

# ---------- Parameters (labelled training audio; the label is inferred from the file name) ----------
WAV_FILE = r"D:\SummerSchool\sample\瓷空1.wav"  # labelled training WAV (name contains "空"/hollow → label auto-detected)
WIN_SIZE = 1024  # frame length in samples (kept identical to the test-audio script)
OVERLAP = 512  # frame overlap in samples (kept identical to the test-audio script)
STEP = WIN_SIZE - OVERLAP  # hop length between consecutive frames
THRESH = 0.01  # per-frame energy threshold (low on purpose, so knocks are not missed)
SEG_LEN_SEC = 0.2  # duration of each extracted knock segment, in seconds
# Label mapping, deep-learning style: "空" (hollow) → 0, "实" (solid) → 1
LABEL_MAP = {"空": 0, "实": 1}
# Output paths: saved next to the audio, stem + "_train_features" to mark them as training data
OUT_MAT = Path(WAV_FILE).parent / f"{Path(WAV_FILE).stem}_train_features.mat"
OUT_PKL = Path(WAV_FILE).parent / f"{Path(WAV_FILE).stem}_train_features.pkl"
|
|
|
|
|
|
|
|
|
|
# ---------- 工具函数(完全复用之前的逻辑,确保特征一致性) ----------
|
|
|
|
|
def segment_signal(signal: np.ndarray, fs: int, win_size=None, step=None,
                   thresh=None, seg_len_sec=None):
    """Split a recording into fixed-length knock segments based on frame energy.

    Parameters
    ----------
    signal : np.ndarray
        Audio samples; mono, or stereo (only channel 0 is used).
    fs : int
        Sample rate in Hz.
    win_size, step, thresh, seg_len_sec :
        Optional overrides for the module-level constants WIN_SIZE, STEP,
        THRESH and SEG_LEN_SEC (used when None, preserving the old behavior).

    Returns
    -------
    list[np.ndarray]
        One normalized segment (up to seg_len_sec seconds) per detected knock.
        Empty list when no frame exceeds the threshold or the signal is
        shorter than one window (the old code raised in that case).
    """
    win_size = WIN_SIZE if win_size is None else win_size
    step = STEP if step is None else step
    thresh = THRESH if thresh is None else thresh
    seg_len_sec = SEG_LEN_SEC if seg_len_sec is None else seg_len_sec

    if signal.ndim > 1:  # stereo → mono (keep channel 0)
        signal = signal[:, 0]
    # Normalize amplitude so the energy threshold is independent of recording gain.
    signal = signal / (np.max(np.abs(signal)) + 1e-12)

    # Guard: framing needs at least one full window.
    if signal.size < win_size:
        return []

    # Frame the signal (win_size samples, hop = step) and compute per-frame
    # energy. Equivalent to librosa.util.frame(...).T, but pure numpy.
    frames = np.lib.stride_tricks.sliding_window_view(signal, win_size)[::step]
    energy = np.sum(frames ** 2, axis=1)

    # Frames whose energy exceeds the threshold are knock candidates.
    idx = np.where(energy > thresh)[0]
    if idx.size == 0:
        return []

    # A gap of more than 5 frames between consecutive loud frames starts a new
    # knock. The first loud frame ALWAYS starts one — the previous version
    # prepended 0 before diffing, which silently dropped a knock beginning
    # within the first 5 frames of the recording.
    hit_mask = np.concatenate(([True], np.diff(idx) > 5))
    hit_starts = idx[hit_mask]

    # Cut a fixed-length segment at each knock start (clipped at signal end).
    seg_len = int(round(seg_len_sec * fs))
    segments = []
    for start_frame in hit_starts:
        start_sample = start_frame * step
        end_sample = min(start_sample + seg_len, len(signal))
        segments.append(signal[start_sample:end_sample])
    return segments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_features(sig: np.ndarray, fs: int):
    """Compute the five-dimensional feature vector of one knock segment.

    Features, in order: RMS energy, dominant frequency (Hz), spectral
    skewness, mean of the first MFCC coefficient, and peak of the Hilbert
    amplitude envelope. Identical to the feature set used for test audio,
    so training and inference data stay compatible.

    Returns a float32 array of length 5; an all-zero vector for an empty
    segment.
    """
    sig = sig.flatten()
    if sig.size == 0:
        # Empty segment: return zeros instead of raising.
        return np.zeros(5, dtype=np.float32)

    # 1. RMS — overall energy of the segment.
    rms = float(np.sqrt(np.mean(sig ** 2)))

    # 2. Dominant frequency — location of the magnitude-spectrum peak.
    n = sig.size
    freqs = np.fft.rfftfreq(n, d=1 / fs)
    spectrum = np.abs(np.fft.rfft(sig))
    main_freq = freqs[spectrum.argmax()]

    # 3. Spectral skewness — asymmetry of the spectrum around its centroid;
    #    one of the key discriminators between hollow ("空") and solid ("实").
    total = np.sum(spectrum) + 1e-12
    centroid = np.sum(freqs * spectrum) / total
    deviation = freqs - centroid
    spread = np.sqrt(np.sum((deviation ** 2) * spectrum) / total)
    skewness = np.sum((deviation ** 3) * spectrum) / (total * (spread ** 3 + 1e-12))

    # 4. Mean of the first MFCC coefficient — a timbre descriptor.
    try:
        mfcc_mean = float(np.mean(librosa.feature.mfcc(y=sig, sr=fs, n_mfcc=13)[0, :]))
    except Exception:
        # e.g. segment too short for the MFCC frame size: fall back to 0.
        mfcc_mean = 0.0

    # 5. Envelope peak via the Hilbert transform — attack/decay strength.
    env_peak = np.max(np.abs(hilbert(sig)))

    # float32 to match deep-learning framework expectations.
    return np.array([rms, main_freq, skewness, mfcc_mean, env_peak], dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
# ---------- 主程序(核心:自动识别标签+特征+标签保存) ----------
|
|
|
|
|
def main():
    """End-to-end training pipeline for one labelled WAV.

    Loads the labelled audio, segments the knocks, extracts the 5-D feature
    vector per segment, derives the label from the file name, saves the
    feature/label matrices to MAT and PKL, and plots the feature curves.
    """
    # 1. Validate the input file. The extension check is case-insensitive:
    #    the previous `suffix != ".wav"` wrongly rejected valid ".WAV" files.
    wav_path = Path(WAV_FILE)
    if not wav_path.exists():
        print(f"❌ 错误:音频文件 {WAV_FILE} 不存在!")
        return
    if wav_path.suffix.lower() != ".wav":
        print(f"❌ 错误:{wav_path.name} 不是WAV格式!")
        return

    # 2. Load the audio with librosa (native sample rate, forced mono).
    audio, sr = librosa.load(wav_path, sr=None, mono=True)
    print(f"✅ 成功读取学习音频:{wav_path.name}")
    print(f" 采样率:{sr} Hz | 音频长度:{len(audio)/sr:.2f} 秒")

    # 3. Segment the recording into knock events.
    segments = segment_signal(audio, sr)
    if len(segments) == 0:
        print(f"⚠️ 未检测到有效敲击片段!可尝试降低THRESH(当前{THRESH})或检查音频是否有敲击声。")
        return
    print(f"✅ 检测到 {len(segments)} 个有效敲击片段")

    # 4. Derive the label from the file name ("空"/hollow → 0, "实"/solid → 1).
    features_list = []
    labels_list = []
    file_stem = wav_path.stem
    if "空" in file_stem:
        label = LABEL_MAP["空"]
        print(f"✅ 自动识别标注:{file_stem} → 标签 {label}(空)")
    elif "实" in file_stem:
        label = LABEL_MAP["实"]
        print(f"✅ 自动识别标注:{file_stem} → 标签 {label}(实)")
    else:
        # Fallback: default to "hollow" (0) when the stem carries no label.
        print(f"⚠️ 文件名 {file_stem} 不含'空'或'实',手动指定标签为0(空)!")
        label = LABEL_MAP["空"]

    # Extract one feature vector per segment; every segment of this file
    # shares the same label.
    for i, seg in enumerate(segments, 1):
        feat = extract_features(seg, sr)
        features_list.append(feat)
        labels_list.append(label)
        print(f" 片段{i:02d}:特征提取完成(维度:5)")

    # 5. Stack into (n_segments, 5) features and (n_segments, 1) labels.
    features_matrix = np.vstack(features_list)
    labels_array = np.array(labels_list, dtype=np.int8).reshape(-1, 1)
    print(f"\n✅ 特征与标签整理完成")
    print(f" 特征矩阵形状:{features_matrix.shape}(行=片段数,列=5维特征)")
    print(f" 标签矩阵形状:{labels_array.shape}(行=片段数,列=1)")

    # 6. Save as MAT (MATLAB-compatible; keys match the earlier training set).
    savemat(OUT_MAT, {
        "matrix": features_matrix,
        "label": labels_array
    })
    print(f"✅ MAT文件已保存:{OUT_MAT}")

    # 7. Save as PKL (Python frameworks such as PyTorch / TensorFlow).
    with open(OUT_PKL, "wb") as f:
        pickle.dump({
            "matrix": features_matrix,
            "label": labels_array
        }, f)
    print(f"✅ PKL文件已保存:{OUT_PKL}")

    # 8. Optional visualization: one subplot per feature across segments.
    plt.figure(figsize=(12, 8))
    feature_names = ["RMS(能量)", "主频(Hz)", "频谱偏度", "MFCC均值", "包络峰值"]
    for i in range(5):
        plt.subplot(2, 3, i + 1)
        plt.plot(range(1, len(features_matrix) + 1), features_matrix[:, i],
                 "-o", color="#1f77b4", linewidth=1.5, markersize=4)
        plt.xlabel("片段编号", fontsize=10)
        plt.ylabel("特征值", fontsize=10)
        plt.title(f"特征{i+1}:{feature_names[i]}", fontsize=11, fontweight="bold")
        plt.grid(True, alpha=0.3)
    # Summary panel (file, label, segment count) in the sixth subplot.
    plt.subplot(2, 3, 6)
    plt.text(0.5, 0.6, f"音频文件:{wav_path.name}", ha="center", fontsize=11)
    plt.text(0.5, 0.4, f"标注标签:{label}({'空' if label==0 else '实'})", ha="center", fontsize=11)
    plt.text(0.5, 0.2, f"有效片段数:{len(features_matrix)}", ha="center", fontsize=11)
    plt.axis("off")
    plt.tight_layout()
    plt.show()
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the full training-feature extraction pipeline.
if __name__ == "__main__":
    main()
|