|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
from pylab import mpl
|
|
|
|
|
|
mpl.rcParams["font.sans-serif"] = ["SimHei"]
|
|
|
mpl.rcParams["axes.unicode_minus"] = False
|
|
|
|
|
|
def _load_csv(path):
    """Read a UTF-8 CSV and coerce its 'date' column to datetime64."""
    frame = pd.read_csv(path, encoding='utf-8')
    frame['date'] = pd.to_datetime(frame['date'])
    return frame


# Load the five source datasets, each carrying a 'date' column.
df_baidu = _load_csv('../data/baidu_index.csv')       # Baidu flu search index
df_beijing = _load_csv('../data/beijin_zhoubao.csv')  # Beijing infectious-disease weekly report
df_liugan = _load_csv('../data/liugan_zhoubao.csv')   # national flu weekly report
df_hx = _load_csv('../data/hx_jijin_data.csv')        # flu fund — Huashang medical/health sector
df_gf = _load_csv('../data/gf_jijin_data.csv')        # flu fund — GF innovative medical 2-yr holding

# Short aliases used by the alignment/merge steps below.
df1, df2, df3, df4, df5 = df_baidu, df_beijing, df_liugan, df_hx, df_gf
|
|
|
# Build a continuous daily calendar spanning the three epidemiological
# series (df1–df3); all five series are then aligned onto this calendar.
all_dates = pd.date_range(
    start=min(df1['date'].min(), df2['date'].min(), df3['date'].min()),
    end=max(df1['date'].max(), df2['date'].max(), df3['date'].max()),
)


def _align(frame):
    """Reindex *frame* onto the full daily calendar, forward-filling gaps,
    and drop the leftover CSV row-number column ('Unnamed: 0')."""
    aligned = (
        frame.set_index('date')
             .reindex(all_dates)
             .ffill()
             .reset_index()
             .rename(columns={'index': 'date'})
    )
    aligned.drop(columns=['Unnamed: 0'], inplace=True)
    return aligned


df1, df2, df3, df4, df5 = (_align(f) for f in (df1, df2, df3, df4, df5))

# Merge the five aligned series on 'date' and keep only the indicator columns.
df_merged = df1
for _other in (df2, df3, df4, df5):
    df_merged = df_merged.merge(_other, on='date', how='outer')
df_merged = df_merged[['date', 'liugan_index', 'beijing_number',
                       'infection_number', 'hx_jijin_data', 'gf_jijin_data']]
|
|
|
|
|
|
# --- Missing-value handling -------------------------------------------------
# Ensure 'date' is a datetime column, then promote it to the index so that
# every remaining column is numeric indicator data.
df_merged['date'] = pd.to_datetime(df_merged['date'])
df_merged.set_index('date', inplace=True)

# Coerce all indicator columns to float in a single pass (the original code
# did this twice: once via columns.difference(['date']) — redundant, since
# 'date' is already the index — and once for the whole frame).
df_merged = df_merged.astype(float)

# Report how many NaN / ±inf values each column carries before cleaning.
print("Initial NaN or Inf check:",
      df_merged.isin([np.inf, -np.inf]).sum(),
      df_merged.isna().sum())

# Clean: map infinities to NaN, forward-fill, then drop whatever remains
# (e.g. leading rows before the first observation of a series).
df_merged.replace([np.inf, -np.inf], np.nan, inplace=True)
# BUG FIX: DataFrame.ffill() returns a new frame; the original call
# discarded its result, so NaNs were never forward-filled and dropna()
# silently deleted all of those rows.
df_merged = df_merged.ffill()
df_merged.dropna(inplace=True)

# Persist the cleaned (not yet normalised) merged dataset.
# NOTE(review): the same path is overwritten below with the normalised
# data — use a distinct filename if the raw merge should be kept.
df_merged.to_csv('../data/merged_data.csv', encoding='utf-8')
|
|
|
# --- Min-max normalisation ---------------------------------------------------
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale into [0, 1] so the series share a common scale.
columns_to_scale = ['liugan_index', 'beijing_number', 'infection_number',
                    'hx_jijin_data', 'gf_jijin_data']

scaler = MinMaxScaler()
df_merged[columns_to_scale] = scaler.fit_transform(df_merged[columns_to_scale])

# Preview the normalised data, then persist it with the date index kept.
print(df_merged.head())
df_merged.to_csv('../data/merged_data.csv', index=True, encoding='utf-8')
|