You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

161 lines
5.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from calendar import isleap
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import GetData
import datetime as DT
'''
处理预测数据
'''
# Air-quality handling: keep only the numeric index of each entry.
def setAir(week_data):
    """Return the air-quality index of every row as a list of ints.

    Each entry of the '空气质量指数' column is split on a single space and
    the piece before the first space is converted with ``int``.

    :param week_data: mapping/DataFrame exposing a '空气质量指数' column of
        strings
    :return: list of ints, one per row, in the original order
    """
    return [int(entry.split(' ')[0]) for entry in week_data['空气质量指数']]
# Temperature handling: drop the '°' unit and convert the value to an integer.
def setHighTemp(week_data):
    """Return the daily high temperatures as a list of ints.

    Everything from the first '°' onward is discarded from each entry of the
    '最高温' column; the remaining text is converted with ``int``.

    :param week_data: mapping/DataFrame exposing a '最高温' column of strings
    :return: list of ints, one per row, in the original order
    """
    highs = []
    for reading in week_data['最高温']:
        number, _sep, _unit = reading.partition('°')
        highs.append(int(number))
    return highs
def setLowTemp(week_data):
    """Return the daily low temperatures as a list of ints.

    The text before the first '°' of each entry in the '最低温' column is
    converted with ``int``.

    :param week_data: mapping/DataFrame exposing a '最低温' column of strings
    :return: list of ints, one per row, in the original order
    """
    return [int(text.split('°', 1)[0]) for text in week_data['最低温']]
# Weather-condition handling: map each day's weather text to an integer code.
def setCondition(week_data):
    """Encode the weather description of every row as an integer.

    The '天气' column is scanned row by row. Each value is converted with
    ``str`` and compared against an ordered keyword table; the first rule
    whose keyword occurs in the text decides the code. Rows that match no
    rule get -1.

    Codes: 1 晴, 2 多云, 3 阴, 4 雨, 5 雪, 6 雾 or 霾, 7 扬沙, -1 unknown.

    :param week_data: mapping/DataFrame exposing a '天气' column
    :return: list of int codes, one per row, in the original order
    """
    # Previously this function read the '最低温' column and tested several
    # empty-string keywords. An empty string is contained in every string,
    # so every row was coded 1 and the later branches were unreachable.
    # NOTE(review): the single-character keywords below (晴, 阴, 雨, 雪, 雾, 霾)
    # are reconstructed from the order of the codes; only '多云' and '扬沙'
    # were legible. Confirm them against the values the crawler emits.
    rules = (
        (('晴',), 1),
        (('多云',), 2),
        (('阴',), 3),
        (('雨',), 4),
        (('雪',), 5),
        (('雾', '霾'), 6),
        (('扬沙',), 7),
    )
    codes = []
    for raw in week_data['天气']:
        text = str(raw)
        matched = next(
            (code for keywords, code in rules
             if any(word in text for word in keywords)),
            -1,  # no keyword matched
        )
        codes.append(matched)
    return codes
def process(date):
    """Convert the text columns of a weather table to numbers.

    The '最高温', '最低温' and '空气质量指数' columns of ``date`` are
    overwritten in place with the integer lists produced by ``setHighTemp``,
    ``setLowTemp`` and ``setAir``. A copy without the '天气' and '风力风向'
    columns is then returned.

    :param date: DataFrame holding the five columns named above (it is
        modified in place by the column assignments)
    :return: new DataFrame without the '天气' and '风力风向' columns
    """
    # Same assignment order as before: high temp, low temp, air quality.
    converters = (
        ('最高温', setHighTemp),
        ('最低温', setLowTemp),
        ('空气质量指数', setAir),
    )
    for column, convert in converters:
        date[column] = convert(date)
    return date.drop('天气', axis=1).drop('风力风向', axis=1)
def write(years, b, c):
    """Fetch a window of weather records, preprocess it and save it as CSV.

    The window is anchored on today's date. It starts ``b[0]`` days before
    today and ends ``b[1]`` days after today. The year of the start date is
    then reduced by ``years[0]`` and the year of the end date by
    ``years[1]``; month and day are passed through unchanged, apart from the
    one-day leap-year correction described in the body.

    :param years: [years subtracted from the start date's year,
                   years subtracted from the end date's year]
    :param b: [days before today for the start date,
               days after today for the end date]
    :param c: name/path of the CSV file to write
    :return: None
    """
    today = DT.datetime.today()
    one_day = DT.timedelta(days=1)
    # Whether the current year is a leap year.
    st = isleap(today.year)
    # Window start (b[0] days ago) and window end (b[1] days ahead).
    week_ago = (today - DT.timedelta(days=b[0])).date()
    week_pre = (today + DT.timedelta(days=b[1])).date()

    # A month sum of 3 or 5 is how the code detects a window touching
    # February (e.g. Jan-Feb or Feb-Mar). If the current year and the target
    # year differ in leap status, the endpoint lying in February is moved by
    # one day.
    # NOTE(review): a window lying entirely inside February (month sum 4) is
    # not corrected, so a Feb 29 endpoint would be forwarded with a non-leap
    # target year; confirm GetData.getPredictDate tolerates that.
    month_sum = week_ago.month + week_pre.month
    if month_sum == 3 or month_sum == 5:
        if week_ago.month == 2 and st != isleap(today.year - years[0]):
            if st:
                # This year is a leap year, the target year is not: -1 day.
                week_ago -= one_day
            else:
                # This year is not a leap year, the target year is: +1 day.
                week_ago += one_day
        if week_pre.month == 2 and st != isleap(today.year - years[1]):
            if st:
                # This year is a leap year, the target year is not: -1 day.
                week_pre -= one_day
            else:
                # This year is not a leap year, the target year is: +1 day.
                week_pre += one_day

    # Crawl the records for the shifted window.
    # Presumably this returns a DataFrame with a "日期" column; verify
    # against GetData.
    date0 = GetData.getPredictDate(
        week_ago.year - years[0], week_ago.month, week_ago.day,
        week_pre.year - years[1], week_pre.month, week_pre.day,
    )
    # Convert the text columns to numbers, index by date and persist.
    date_ = process(date0).set_index("日期")
    date_.to_csv(c)
# Purpose: build the training, validation and prediction data sets.
def ProcessData():
    """Download, split and impute the data used for training and prediction.

    Three CSV files are (re)written through ``write``:

    * ``date_train.csv`` - last year, from 14 days before today's date to
      today's date; used as the features ``X``.
    * ``date_valid.csv`` - last year, from today's date to 14 days after it;
      used as the targets ``y``.
    * ``date_test.csv`` - this year, from 14 days ago to today; used as the
      prediction input.

    ``X`` and ``y`` are split 80/20 with ``train_test_split``
    (``random_state=0``). Missing values are filled with ``SimpleImputer``
    at its default settings. One imputer is fitted on the training features
    and reused for the validation features and the prediction input, so the
    prediction input is filled with training statistics instead of its own.
    A second imputer is fitted on the training targets and reused for the
    validation targets.

    The returned frames are rebuilt from arrays, so they carry the original
    column names but a fresh integer index, not the date index.

    :return: [imputed_X_train, imputed_X_valid,
              imputed_y_train, imputed_y_valid,
              imputed_X_test]
    """
    # Refresh the CSV files.
    write([1, 1], [14, 0], "date_train.csv")
    write([1, 1], [0, 14], "date_valid.csv")
    write([0, 0], [14, 0], "date_test.csv")

    # Load prediction input, features and targets, indexed by date.
    X_test = pd.read_csv("date_test.csv", index_col="日期", parse_dates=True)
    X = pd.read_csv("date_train.csv", index_col="日期", parse_dates=True)
    y = pd.read_csv("date_valid.csv", index_col="日期", parse_dates=True)

    # Reproducible random 80/20 split into training and validation parts.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.8, test_size=0.2, random_state=0
    )

    # Features: fit on the training part only, reuse everywhere else.
    feature_imputer = SimpleImputer()
    imputed_X_train = pd.DataFrame(feature_imputer.fit_transform(X_train))
    imputed_X_valid = pd.DataFrame(feature_imputer.transform(X_valid))
    imputed_X_train.columns = X_train.columns
    imputed_X_valid.columns = X_valid.columns

    # Targets get their own imputer so the feature statistics stay intact.
    target_imputer = SimpleImputer()
    imputed_y_train = pd.DataFrame(target_imputer.fit_transform(y_train))
    imputed_y_valid = pd.DataFrame(target_imputer.transform(y_valid))
    imputed_y_train.columns = y_train.columns
    imputed_y_valid.columns = y_valid.columns

    # Prediction input: transform with the training-fitted feature imputer
    # and restore the column names so it lines up with the training frame.
    imputed_X_test = pd.DataFrame(feature_imputer.transform(X_test))
    imputed_X_test.columns = X_test.columns

    return [imputed_X_train, imputed_X_valid, imputed_y_train, imputed_y_valid, imputed_X_test]