You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
import pandas as pd
import numpy as np
from sklearn . preprocessing import MinMaxScaler , LabelEncoder
import config
def load_and_preprocess_data():
    """Load the CSV at ``config.DATA_PATH`` and prepare it for modelling.

    Processing steps, in order:

    1. Read the CSV and drop every row that contains at least one NaN.
    2. Label-encode the aircraft identifier column into an extra integer
       feature column.
    3. Sort by aircraft and time, then derive an acceleration feature as the
       per-aircraft first difference of velocity (0 for the first row of each
       aircraft). The difference is not divided by the elapsed time.
    4. Min-max scale all feature columns in place; the scaler is fitted on
       the whole frame.
    5. Re-sort chronologically and add an integer time-window column computed
       as ``time // config.TIME_WINDOW``.

    Progress and a short summary are printed to stdout.

    Returns:
        A ``(df, feature_cols, label_col)`` tuple: the processed DataFrame,
        the list of feature column names (base features, encoded aircraft id
        and acceleration), and the name of the label column.
    """
    print(" Loading data... ")

    # NOTE(review): every string literal in this function is reproduced
    # exactly as it appears in the reviewed source, including the blank
    # padding inside the quotes (e.g. " lat "). That padding looks like an
    # extraction artifact -- confirm the column names against the real CSV
    # header before running.

    # 1. Load the data.
    df = pd.read_csv(config.DATA_PATH)

    # 2. Paper requirement: drop rows that contain NaN.
    df = df.dropna(axis=0)

    # 3. [Key change] Fill these in according to the CSV's actual column
    #    names. Base features are read directly from the CSV.
    feature_cols = [
        " lat ", " lon ", " baroaltitude ", " geoaltitude ",
        " velocity ", " heading ",
    ]

    # 4. Encode ICAO24. It is a hexadecimal string, but it is treated as a
    #    categorical value here.
    le_icao = LabelEncoder()
    df[" icao24_enc "] = le_icao.fit_transform(df[" icao24 "])
    feature_cols.append(" icao24_enc ")

    # 5. [Core] Build the derived features mentioned in the paper.
    #    Sort by aircraft and time first so that consecutive rows of one
    #    aircraft are adjacent and in chronological order.
    df = df.sort_values([" icao24 ", " time "]).reset_index(drop=True)

    # Derived feature 1: acceleration, computed per aircraft as the current
    # velocity minus the previous one. The first sample of each aircraft has
    # no predecessor and is filled with 0.
    df[" acceleration "] = df.groupby(" icao24 ")[" velocity "].diff().fillna(0)
    feature_cols.append(" acceleration ")

    # Label column.
    label_col = " label "

    # 6. Paper requirement: Min-Max normalisation of every feature column.
    scaler = MinMaxScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])

    # 7. Sort chronologically (aviation data must not be randomly shuffled).
    #    A stable sort keeps rows that share a timestamp in the per-aircraft
    #    order established above instead of an arbitrary tie order.
    df = df.sort_values(" time ", kind="mergesort").reset_index(drop=True)

    # 8. Add a time-window column, used later for graph construction.
    df[" time_window "] = (df[" time "] // config.TIME_WINDOW).astype(int)

    print(" Data loaded successfully! ")
    print(f" Total samples: {len(df)} ")
    print(f" Features used: {feature_cols} ")
    print(f" Label distribution: \n {df[label_col].value_counts()} ")

    return df, feature_cols, label_col