You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

72 lines
3.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import numpy as np
import pandas as pd
from pylab import mpl
# Use the SimHei font so Chinese (CJK) labels render correctly in matplotlib.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# Keep the minus sign renderable when a non-ASCII font is active.
mpl.rcParams["axes.unicode_minus"] = False
# Load the five raw data sets (all UTF-8 CSV files).
df_baidu = pd.read_csv('../data/baidu_index.csv', encoding='utf-8')       # Baidu flu search index
df_beijing = pd.read_csv('../data/beijin_zhoubao.csv', encoding='utf-8')  # Beijing infectious-disease weekly report
df_liugan = pd.read_csv('../data/liugan_zhoubao.csv', encoding='utf-8')   # national flu weekly report
df_hx = pd.read_csv('../data/hx_jijin_data.csv', encoding='utf-8')        # fund: Huashang medical & health sector
df_gf = pd.read_csv('../data/gf_jijin_data.csv', encoding='utf-8')        # fund: GF innovative-healthcare 2yr holding mix
# Parse each frame's 'date' column into datetime64 in place.
for _frame in (df_baidu, df_beijing, df_liugan, df_hx, df_gf):
    _frame['date'] = pd.to_datetime(_frame['date'])
# Short aliases used throughout the rest of the script.
df1, df2, df3, df4, df5 = df_baidu, df_beijing, df_liugan, df_hx, df_gf
# Build one continuous daily date range spanning the union of the first
# three data sets (the two fund series are aligned to this same range).
all_dates = pd.date_range(start=min(df1['date'].min(), df2['date'].min(), df3['date'].min()),
end=max(df1['date'].max(), df2['date'].max(), df3['date'].max()))
# Reindex every frame onto the full daily range, forward-filling the gaps
# (the weekly reports only have one row per week), then drop the stray
# 'Unnamed: 0' column that pandas wrote as the CSV row index.
_aligned = []
for _frame in (df1, df2, df3, df4, df5):
    _frame = (_frame.set_index('date')
              .reindex(all_dates)
              .ffill()
              .reset_index()
              .rename(columns={'index': 'date'})
              .drop(columns=['Unnamed: 0']))
    _aligned.append(_frame)
df1, df2, df3, df4, df5 = _aligned
# Fold the five frames into one via successive outer joins on 'date',
# so no observation from any source is lost.
df_merged = df1
for _other in (df2, df3, df4, df5):
    df_merged = df_merged.merge(_other, on='date', how='outer')
# Keep only the columns of interest, in a fixed order.
df_merged = df_merged[['date', 'liugan_index', 'beijing_number', 'infection_number', 'hx_jijin_data', 'gf_jijin_data']]
# Re-parse 'date' and promote it to the index.
df_merged['date'] = pd.to_datetime(df_merged['date'])
df_merged = df_merged.set_index('date')
# Cast the value columns to float ('date' is already the index, so this
# difference() selects every remaining column).
_value_cols = df_merged.columns.difference(['date'])
df_merged[_value_cols] = df_merged[_value_cols].astype(float)
# Ensure a uniform float dtype, then audit for NaN / infinite values.
df_merged = df_merged.astype(float)
print("Initial NaN or Inf check:", df_merged.isin([np.inf, -np.inf]).sum(), df_merged.isna().sum())
# Convert +/-inf to NaN so a single fill/drop pass handles both.
df_merged.replace([np.inf, -np.inf], np.nan, inplace=True)
# BUG FIX: ffill() returns a new frame and is NOT in place — the original
# code discarded the result, so NaNs survived and dropna() silently deleted
# rows that should have been forward-filled. Rebind the result instead.
df_merged = df_merged.ffill()
df_merged.dropna(inplace=True)  # drop leading rows ffill cannot fill
# Checkpoint the cleaned (pre-normalization) data.
df_merged.to_csv('../data/merged_data.csv', encoding='utf-8')
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Columns to rescale into the [0, 1] range.
columns_to_scale = ['liugan_index', 'beijing_number', 'infection_number', 'hx_jijin_data', 'gf_jijin_data']
df_merged[columns_to_scale] = scaler.fit_transform(df_merged[columns_to_scale])
# Inspect the normalized result.
print(df_merged.head())
# NOTE(review): this overwrites the pre-normalization checkpoint written
# above (same path) — confirm whether the raw merged data should be saved
# to a separate file instead.
df_merged.to_csv('../data/merged_data.csv', index=True, encoding='utf-8')