import numpy as np
import pandas as pd
from pylab import mpl

mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False

df_baidu = pd.read_csv('../data/baidu_index.csv', encoding='utf-8')       # Baidu flu search index
df_beijing = pd.read_csv('../data/beijin_zhoubao.csv', encoding='utf-8')  # Beijing infectious-disease weekly report
df_liugan = pd.read_csv('../data/liugan_zhoubao.csv', encoding='utf-8')   # national flu weekly report
df_hx = pd.read_csv('../data/hx_jijin_data.csv', encoding='utf-8')        # flu-related fund: 华商医药医疗行业
df_gf = pd.read_csv('../data/gf_jijin_data.csv', encoding='utf-8')        # flu-related fund: 广发创新医疗两年持有混合

# Make sure the date columns are datetime
df_baidu['date'] = pd.to_datetime(df_baidu['date'])
df_beijing['date'] = pd.to_datetime(df_beijing['date'])
df_liugan['date'] = pd.to_datetime(df_liugan['date'])
df_hx['date'] = pd.to_datetime(df_hx['date'])
df_gf['date'] = pd.to_datetime(df_gf['date'])

df1 = df_baidu
df2 = df_beijing
df3 = df_liugan
df4 = df_hx
df5 = df_gf

# Build a complete daily date range covering the three epidemic series
all_dates = pd.date_range(start=min(df1['date'].min(), df2['date'].min(), df3['date'].min()),
                          end=max(df1['date'].max(), df2['date'].max(), df3['date'].max()))

# Reindex each DataFrame onto the full date range, forward-filling the gaps
df1 = df1.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
df2 = df2.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
df3 = df3.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
df4 = df4.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
df5 = df5.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})

# Drop the leftover index column written by the original CSV export
df1.drop(columns=['Unnamed: 0'], inplace=True)
df2.drop(columns=['Unnamed: 0'], inplace=True)
df3.drop(columns=['Unnamed: 0'], inplace=True)
df4.drop(columns=['Unnamed: 0'], inplace=True)
df5.drop(columns=['Unnamed: 0'], inplace=True)

# Merge the datasets on the date column
df_merged = (df1.merge(df2, on='date', how='outer')
                .merge(df3, on='date', how='outer')
                .merge(df4, on='date', how='outer')
                .merge(df5, on='date', how='outer'))
df_merged = df_merged[['date', 'liugan_index', 'beijing_number', 'infection_number',
                       'hx_jijin_data', 'gf_jijin_data']]

# Inspect the merged DataFrame
# print(df_merged.head(20))

# Missing-value handling
# df = df_merged.dropna(how='any')

# Make sure the 'date' column is datetime and use it as the index
df_merged['date'] = pd.to_datetime(df_merged['date'])
df_merged.set_index('date', inplace=True)

# Convert only the non-date columns to float
numerical_columns = df_merged.columns.difference(['date'])  # exclude 'date'
df_merged[numerical_columns] = df_merged[numerical_columns].astype(float)

# Confirm the dtypes and check for NaN or inf values
df_merged = df_merged.astype(float)
print("Initial NaN or Inf check:", df_merged.isin([np.inf, -np.inf]).sum(), df_merged.isna().sum())

# Handle NaN and infinite values
df_merged.replace([np.inf, -np.inf], np.nan, inplace=True)
df_merged = df_merged.ffill()   # forward-fill NaN values (ffill() returns a copy, so assign it back)
df_merged.dropna(inplace=True)  # drop any rows that still contain NaN

df_merged.to_csv('../data/merged_data.csv', encoding='utf-8')

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Columns to normalise
columns_to_scale = ['liugan_index', 'beijing_number', 'infection_number', 'hx_jijin_data', 'gf_jijin_data']

# Min-max normalise the selected columns
df_merged[columns_to_scale] = scaler.fit_transform(df_merged[columns_to_scale])

# Inspect the normalised data
print(df_merged.head())

df_merged.to_csv('../data/merged_data.csv', index=True, encoding='utf-8')
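
# Optional sanity check (a minimal sketch, assuming '../data/merged_data.csv' was just written by
# the code above and that `columns_to_scale` is still in scope): reload the file and confirm the
# scaled columns contain no NaN and lie in [0, 1], so the merge/normalisation step can be trusted
# before any downstream modelling.
df_check = pd.read_csv('../data/merged_data.csv', encoding='utf-8',
                       parse_dates=['date'], index_col='date')
assert not df_check[columns_to_scale].isna().any().any(), "unexpected NaN after preprocessing"
assert df_check[columns_to_scale].min().min() >= 0.0, "scaled columns should be >= 0"
assert df_check[columns_to_scale].max().max() <= 1.0, "scaled columns should be <= 1"
print("merged_data.csv passed the NaN/range sanity check")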