You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

72 lines
3.6 KiB

5 months ago
import numpy as np
import pandas as pd
from pylab import mpl
# Matplotlib config for Chinese text: SimHei supplies CJK glyphs, and
# axes.unicode_minus=False keeps the minus sign rendering with non-ASCII fonts.
# NOTE(review): nothing in this chunk actually plots — presumably used by a
# later/companion plotting script; confirm before removing.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False
def _load_dated_csv(path):
    """Read a UTF-8 CSV and parse its 'date' column to datetime."""
    df = pd.read_csv(path, encoding='utf-8')
    df['date'] = pd.to_datetime(df['date'])
    return df

df_baidu = _load_dated_csv('../data/baidu_index.csv')      # Baidu flu search index
df_beijing = _load_dated_csv('../data/beijin_zhoubao.csv')  # Beijing infectious-disease weekly report
df_liugan = _load_dated_csv('../data/liugan_zhoubao.csv')   # national flu weekly report
df_hx = _load_dated_csv('../data/hx_jijin_data.csv')        # flu-related fund: Huashang Healthcare
df_gf = _load_dated_csv('../data/gf_jijin_data.csv')        # flu-related fund: GF Innovative Medical

# Short aliases used by the alignment/merge steps below.
df1, df2, df3, df4, df5 = df_baidu, df_beijing, df_liugan, df_hx, df_gf
# Build one continuous daily date range spanning the three epidemiological
# series. NOTE(review): only df1–df3 define the range; any df4/df5 (fund) rows
# outside it are silently dropped by the reindex — confirm this is intended.
all_dates = pd.date_range(start=min(df1['date'].min(), df2['date'].min(), df3['date'].min()),
                          end=max(df1['date'].max(), df2['date'].max(), df3['date'].max()))

def _reindex_daily(df):
    """Align a frame onto all_dates, forward-filling gaps (weekly data becomes
    daily) and dropping the 'Unnamed: 0' column left over from the CSV index."""
    df = df.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
    return df.drop(columns=['Unnamed: 0'])

df1, df2, df3, df4, df5 = (_reindex_daily(d) for d in (df1, df2, df3, df4, df5))

# Merge all five aligned series on 'date' and keep only the value columns.
df_merged = df1.merge(df2, on='date', how='outer')
for _extra in (df3, df4, df5):
    df_merged = df_merged.merge(_extra, on='date', how='outer')
df_merged = df_merged[['date', 'liugan_index', 'beijing_number', 'infection_number','hx_jijin_data','gf_jijin_data']]
# --- Cleaning: index on date, cast to float, handle NaN/inf -----------------

# Ensure 'date' is datetime and promote it to the index.
df_merged['date'] = pd.to_datetime(df_merged['date'])
df_merged.set_index('date', inplace=True)

# Cast value columns to float. ('date' is now the index, so difference(['date'])
# covers every remaining column — kept for safety if a 'date' column reappears.)
numerical_columns = df_merged.columns.difference(['date'])
df_merged[numerical_columns] = df_merged[numerical_columns].astype(float)

# Report how many NaN / ±inf cells exist before cleaning.
print("Initial NaN or Inf check:", df_merged.isin([np.inf, -np.inf]).sum(), df_merged.isna().sum())

# Replace infinities with NaN, forward-fill, then drop any rows still missing.
df_merged.replace([np.inf, -np.inf], np.nan, inplace=True)
# BUG FIX: ffill() is not in-place — the original discarded its result, so the
# gaps it was meant to fill were dropped by dropna() instead. Rebind the result.
df_merged = df_merged.ffill()
df_merged.dropna(inplace=True)
# NOTE(review): this file is rewritten with normalized values at the end of the
# script; this intermediate save only survives if the later write is removed.
df_merged.to_csv('../data/merged_data.csv',encoding='utf-8')
from sklearn.preprocessing import MinMaxScaler

# Rescale every value column into the [0, 1] range so the series are comparable.
scaler = MinMaxScaler()
columns_to_scale = ['liugan_index', 'beijing_number', 'infection_number','hx_jijin_data','gf_jijin_data']
df_merged.loc[:, columns_to_scale] = scaler.fit_transform(df_merged.loc[:, columns_to_scale])

# Preview the normalized frame, then persist it (date index included).
print(df_merged.head())
df_merged.to_csv('../data/merged_data.csv',index=True,encoding = 'utf-8')