parent
f62eb3b8da
commit
1722592fe4
@ -0,0 +1,58 @@
|
||||
import pandas as pd
|
||||
from data_request import get_data_from_mysql
|
||||
|
||||
# 将每一类的语音数据拼成一整段
|
||||
def aggregate_texts(group):
    """Concatenate every utterance of one group into a single transcript.

    Each row contributes one line of the form ``"<key>: <text>\\n"``, where
    ``<key>`` is the last ``_``-separated field of the row's ``type`` value
    (the original comment lists AT / PI / PIAT as examples).

    Args:
        group: DataFrame with at least the columns ``type`` and ``text``.

    Returns:
        The joined transcript as one string; ``''`` when the group has no rows.
    """
    # Walk only the two columns that are needed and join once, instead of
    # materialising a Series per row with iterrows() and growing a string.
    return ''.join(
        f"{type_value.split('_')[-1]}: {words}\n"
        for type_value, words in zip(group['type'], group['text'])
    )
|
||||
|
||||
# 构建新的type列
|
||||
def construct_new_type(group):
    """Build the merged ``type`` label for one category group.

    Every ``type`` value is split on ``_``; field 2 is treated as the start
    time and field 3 as the end time. The label is
    ``"<category>_<earliest start>_<latest end>"``, with the category taken
    from the first row of the group's ``category`` column.

    Args:
        group: DataFrame with the columns ``type`` and ``category``.

    Returns:
        The combined label string. The start/end fields are emitted exactly
        as they appear in the input (padding is preserved).
    """
    fields = [type_value.split('_') for type_value in group['type']]
    starts = [parts[2] for parts in fields]
    ends = [parts[3] for parts in fields]

    # Plain string comparison ranks '1000' before '900'. When every field is
    # a decimal number compare by value; otherwise keep string ordering.
    all_numeric = all(value.isdecimal() for value in starts + ends)
    order_key = int if all_numeric else None

    start_time = min(starts, key=order_key)
    end_time = max(ends, key=order_key)
    return f"{group['category'].iloc[0]}_{start_time}_{end_time}"
|
||||
|
||||
# 整合表格中一段语音数据
|
||||
def reshape_data(df):
    """Collapse the rows of each recording into one aggregated row.

    Rows are grouped by the first two ``_``-separated fields of ``type``
    (e.g. ``uwb-atcc_ACCU-0agmXf``). For every group one output row is built
    with:

    * ``type`` - label produced by ``construct_new_type``
    * ``text`` - transcript produced by ``aggregate_texts``
    * ``segment_start_time`` - minimum of the group's start times
    * ``segment_end_time`` - maximum of the group's end times
    * ``duration`` - ``segment_end_time - segment_start_time``

    Args:
        df: DataFrame with the columns ``type``, ``text``,
            ``segment_start_time`` and ``segment_end_time``. It is not
            modified.

    Returns:
        A new DataFrame with one row per group and the five columns listed
        above; an empty frame with those columns when ``df`` has no rows.
    """
    columns = ['type', 'text', 'segment_start_time', 'segment_end_time', 'duration']
    if df.empty:
        # Keep the schema stable so downstream code (e.g. to_csv) still sees headers.
        return pd.DataFrame(columns=columns)

    # assign() works on a copy: the caller's frame does not grow a 'category'
    # column, and no SettingWithCopyWarning fires when df is a slice.
    categorized = df.assign(
        category=df['type'].apply(lambda x: '_'.join(x.split('_')[:2]))
    )

    rows = []
    for _, group in categorized.groupby('category'):
        segment_start = group['segment_start_time'].min()
        segment_end = group['segment_end_time'].max()
        rows.append({
            'type': construct_new_type(group),
            'text': aggregate_texts(group),
            'segment_start_time': segment_start,
            'segment_end_time': segment_end,
            'duration': segment_end - segment_start,
        })
    return pd.DataFrame(rows, columns=columns)
|
||||
|
||||
# 识别飞机呼号
|
||||
def recognize_callsign(df):
    """Pipeline stage reserved for aircraft call-sign recognition.

    No recognition logic is implemented yet: the input is handed back
    unchanged (the very same object).
    """
    # Placeholder stage - pass the data straight through.
    return df
|
||||
|
||||
# 总结该呼号的相关信息
|
||||
def summarize_callsign(df):
    """Pipeline stage reserved for summarising per-call-sign information.

    No summarisation logic is implemented yet: the input is handed back
    unchanged (the very same object).
    """
    # Placeholder stage - pass the data straight through.
    return df
|
||||
|
||||
def wash_pipeline(df):
    """Run the full pre-washing pipeline on ``df`` and return the result.

    The stages run in this order, each receiving the previous stage's output:

    1. ``reshape_data`` - merge the text rows of each recording
    2. ``recognize_callsign`` - identify aircraft call signs
    3. ``summarize_callsign`` - summarise information per call sign
    """
    stages = (reshape_data, recognize_callsign, summarize_callsign)
    result = df
    for stage in stages:
        result = stage(result)
    return result
|
||||
|
||||
if __name__ == "__main__":
    from pathlib import Path

    # NOTE(review): database credentials are hard-coded here; consider
    # loading them from the environment or a config file.
    df = get_data_from_mysql('root', '1234', 'atc', 'origin_table')

    # Only process the first 1000 rows. Copy the slice so later column
    # assignments operate on an independent frame rather than a view.
    df = df.head(1000).copy()
    df = wash_pipeline(df)

    # Build the path from components so the script also works where '\\'
    # is not a path separator (the original literal was Windows-only).
    output_path = Path('..') / 'data' / 'wash_result.csv'
    df.to_csv(output_path, index=False)
|
@ -0,0 +1,9 @@
|
||||
#从mysql数据库中获取数据
|
||||
import pandas as pd
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
def get_data_from_mysql(user, password, database, table):
    """Load every row of ``table`` from the local MySQL server.

    Connects to ``localhost:3306`` through the ``mysql+pymysql`` driver.

    Args:
        user: MySQL user name.
        password: Password for ``user``.
        database: Name of the database (schema) to connect to.
        table: Name of the table to read. It is interpolated into the SQL
            text as-is, so pass trusted identifiers only.

    Returns:
        A DataFrame holding the result of ``SELECT * FROM <table>``.
    """
    from urllib.parse import quote_plus

    # Percent-encode the credentials: characters such as '@', ':' or '/' in
    # a raw password would otherwise corrupt the connection URL.
    url = (
        f'mysql+pymysql://{quote_plus(str(user))}:{quote_plus(str(password))}'
        f'@localhost:3306/{database}'
    )
    engine = create_engine(url)
    try:
        # Identifiers cannot be sent as bound parameters, hence the f-string.
        return pd.read_sql(f"SELECT * FROM {table}", con=engine)
    finally:
        # Release the pooled connections instead of leaving them to the GC.
        engine.dispose()
|
@ -0,0 +1,14 @@
|
||||
import pandas as pd
|
||||
from sqlalchemy import create_engine, text
|
||||
|
||||
def save_data_to_mysql(df, user, password, database, table):
    """Append the rows of ``df`` to ``table`` on the local MySQL server.

    Connects to ``localhost:3306`` through the ``mysql+pymysql`` driver and
    writes with ``if_exists='append'`` and without the DataFrame index.

    Args:
        df: DataFrame whose rows are inserted.
        user: MySQL user name.
        password: Password for ``user``.
        database: Name of the database (schema) to connect to.
        table: Name of the target table.
    """
    from urllib.parse import quote_plus

    # Percent-encode the credentials: characters such as '@', ':' or '/' in
    # a raw password would otherwise corrupt the connection URL.
    url = (
        f'mysql+pymysql://{quote_plus(str(user))}:{quote_plus(str(password))}'
        f'@localhost:3306/{database}'
    )
    engine = create_engine(url)
    try:
        df.to_sql(table, con=engine, if_exists='append', index=False)
    finally:
        # Release the pooled connections instead of leaving them to the GC.
        engine.dispose()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    from pathlib import Path

    # Read the CSV and store it in MySQL; the CSV's 'id' column is saved
    # under the name 'type'. The path is built from components so the script
    # also works where '\\' is not a path separator.
    source_csv = Path('..') / 'data' / 'train2.csv'
    df = pd.read_csv(source_csv).rename(columns={'id': 'type'})

    # NOTE(review): database credentials are hard-coded here; consider
    # loading them from the environment or a config file.
    save_data_to_mysql(df, 'root', '1234', 'atc', 'origin_table')
    print("数据导入完成,共插入{}条记录".format(len(df)))
|
@ -0,0 +1,10 @@
|
||||
import pandas as pd
|
||||
from data_request import get_data_from_mysql
|
||||
from data_prewashing import wash_pipeline
|
||||
if __name__ == "__main__":
    # Fetch the raw rows from MySQL.
    # NOTE(review): database credentials are hard-coded here; consider
    # loading them from the environment or a config file.
    df = get_data_from_mysql('root', '1234', 'atc', 'origin_table')

    # Drop the 'id' column when the table has one. errors='ignore' avoids a
    # KeyError when it is absent - presumably the case when the table was
    # created by the import script, which stores the CSV id as 'type'; verify
    # against the actual schema.
    df = df.drop(columns=['id'], errors='ignore')

    # Run the pre-processing pipeline.
    df = wash_pipeline(df)
|
||||
|
Loading…
Reference in new issue