|
|
# 代码9-6
|
|
|
#ARIMA模型
|
|
|
import pandas as pd
|
|
|
import matplotlib.pyplot as plt
|
|
|
import warnings
|
|
|
import math
|
|
|
import itertools
|
|
|
import numpy as np
|
|
|
import statsmodels.api as sm
|
|
|
from statsmodels.graphics.tsaplots import plot_acf
|
|
|
from statsmodels.stats.diagnostic import acorr_ljungbox
|
|
|
from statsmodels.tsa.stattools import adfuller as ADF
|
|
|
from statsmodels.tsa.arima_model import ARIMA
|
|
|
import matplotlib.ticker as ticker
|
|
|
|
|
|
|
|
|
on_h = pd.read_csv('../tmp/on_h.csv', index_col=0
|
|
|
,encoding='utf-8')
|
|
|
train = pd.DataFrame(on_h.iloc[0:426, 0])
|
|
|
test = pd.DataFrame(on_h.iloc[426:, 0])
|
|
|
#train.plot(title = '训练集时序图') # 画训练集时序图
|
|
|
x = train.index
|
|
|
y = train.on_man
|
|
|
fig, ax = plt.subplots(1,1)
|
|
|
ax.plot(x, y)
|
|
|
ticker_spacing = 70
|
|
|
ax.xaxis.set_major_locator(ticker.MultipleLocator(ticker_spacing))
|
|
|
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
|
|
|
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
|
|
|
plt.xticks(rotation = 45)
|
|
|
plt.xlabel('日期')
|
|
|
plt.ylabel('客流量')
|
|
|
plt.title('训练集时序图')
|
|
|
plt.tight_layout() # tight_layout()方法可以保证图像的完整度
|
|
|
plt.show()
|
|
|
|
|
|
plot_acf(train,lags=400)
|
|
|
plt.xlabel('日期索引')
|
|
|
plt.ylabel('自相关系数')
|
|
|
plt.title('训练集自相关图')
|
|
|
plt.tight_layout()
|
|
|
plt.show() # 画训练集自相关图
|
|
|
|
|
|
print('原始序列的ADF检验结果为:', ADF(train['on_man'])) # 检验训练集平稳性
|
|
|
print('白噪声检验结果为:', acorr_ljungbox(train['on_man'], lags=1))
|
|
|
|
|
|
|
|
|
|
|
|
# 代码9-7
|
|
|
import statsmodels.api as sm
|
|
|
train['on_man'] = train['on_man'].astype(float)
|
|
|
# 定阶
|
|
|
bic_matrix = [] # bic矩阵,选择bic矩阵中最小值对应的行(p),列(q)
|
|
|
# 存在部分报错,所以用try来跳过报错
|
|
|
for p in range(11):
|
|
|
tmp = []
|
|
|
for q in range(5):
|
|
|
try:
|
|
|
tmp.append(sm.tsa.ARIMA(train, order=(p, 1, q)).fit().bic)
|
|
|
except:
|
|
|
tmp.append(None)
|
|
|
bic_matrix.append(tmp)
|
|
|
bic_matrix = pd.DataFrame(bic_matrix) # 从中可以找出最小值
|
|
|
p, q = bic_matrix.stack().idxmin() # 先用stack展平,然后用idxmin找出最小值位置
|
|
|
print('BIC最小的p值和q值为:%s、%s' % (p, q))
|
|
|
|
|
|
model = sm.tsa.ARIMA(train, order=(p, 1, q)).fit() # 建立ARIMA(p, 1, 1)模型
|
|
|
summary = model.summary() # 给出一份模型报告
|
|
|
forecast = model.forecast(10)
|
|
|
print('10天的预测结果、标准误差和置信区间分别为:\n', forecast)
|
|
|
|
|
|
# pre = pd.DataFrame(forecast, columns = ['predict'])
|
|
|
pre = pd.DataFrame({'predict':forecast})
|
|
|
pre.index = test.index #如果索引不同将pre加到test中时会出错
|
|
|
test['pre'] = pre
|
|
|
plt.plot(test.index, test.on_man)
|
|
|
plt.plot(test.index, test.pre, linestyle=':')
|
|
|
plt.xticks(rotation = 45)
|
|
|
plt.legend(['真实', '预测'])
|
|
|
plt.title('预测结果和实际结果的对比')
|
|
|
plt.xlabel('日期')
|
|
|
plt.ylabel('上车人数')
|
|
|
plt.tight_layout() # tight_layout()方法可以保证图像的完整度
|
|
|
plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
# 代码9-8
|
|
|
# 剔除节假日之后的时序图
|
|
|
on_nh = on_h[on_h.iloc[:,2] != '小长假'] # 剔除假期
|
|
|
x = on_nh.index
|
|
|
y = on_nh.on_man
|
|
|
fig, ax = plt.subplots(1,1)
|
|
|
ax.plot(x, y)
|
|
|
plt.xticks(range(len(x)), on_nh['date.1'])
|
|
|
ticker_spacing = 70
|
|
|
ax.xaxis.set_major_locator(ticker.MultipleLocator(ticker_spacing))
|
|
|
plt.xticks(rotation = 45)
|
|
|
plt.title('剔除节假日后ST111-01站点客流量')
|
|
|
plt.xlabel('日期')
|
|
|
plt.ylabel('客流量(人)')
|
|
|
plt.tight_layout() # tight_layout()方法可以保证图像的完整度
|
|
|
plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
# 代码9-9
|
|
|
# 剔除节假日及前一天后的时序图
|
|
|
new_h = on_h #存放节假日更新的数据
|
|
|
new_h.index = range(len(new_h))
|
|
|
for i in range(1, len(new_h)):
|
|
|
if new_h.iloc[i, 2] == '小长假' :
|
|
|
new_h.iloc[i-1, 2] = '小长假'
|
|
|
new_nh = new_h[new_h.iloc[:,2] != '小长假'] # 剔除节假日及其前一天
|
|
|
|
|
|
plt.plot(new_nh['date.1'], new_nh['on_man'],color='green')
|
|
|
plt.gca().xaxis.set_major_locator(ticker.MultipleLocator(70))
|
|
|
plt.xticks(rotation = 45)
|
|
|
plt.title('剔除节假日及其前一天后ST111-01站点客流量')
|
|
|
plt.legend(['on_man'], loc=2)
|
|
|
plt.xlabel('日期')
|
|
|
plt.ylabel('客流量(人)')
|
|
|
plt.tight_layout() # tight_layout()方法可以保证图像的完整度
|
|
|
plt.savefig('../tmp/9-10.png', dpi=1080)
|
|
|
plt.show()
|
|
|
|
|
|
|
|
|
# 代码9-10
|
|
|
new_nh.index = range(len(new_nh))
|
|
|
train1 = pd.DataFrame(new_nh.iloc[0:378, 0])
|
|
|
test1 = pd.DataFrame(new_nh.iloc[378:, 0])
|
|
|
p = q = range(4)
|
|
|
d = range(2)
|
|
|
pdq = list(itertools.product(p, d, q))
|
|
|
seasonal_pdq = [(x[0], x[1], x[2], 7)for x in list(itertools.product(p, d, q))]
|
|
|
print('Examples of parameter combinations for Seasonal ARIMA...')
|
|
|
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
|
|
|
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
|
|
|
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
|
|
|
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))
|
|
|
warnings.filterwarnings("ignore") # specify to ignore warning messages
|
|
|
sa =[]
|
|
|
for param in pdq:
|
|
|
for param_seasonal in seasonal_pdq:
|
|
|
try:
|
|
|
mod = sm.tsa.statespace.SARIMAX(train1,
|
|
|
order=param,
|
|
|
seasonal_order=param_seasonal,
|
|
|
enforce_stationarity=False,
|
|
|
enforce_invertibility=False)
|
|
|
results = mod.fit()
|
|
|
print('ARIMA{}x{}7 - AIC:{}'.format(param, param_seasonal, results.aic))
|
|
|
sa.append(param)
|
|
|
sa.append(param_seasonal)
|
|
|
sa.append(results.aic)
|
|
|
except:
|
|
|
continue
|
|
|
AIC = [i for i in sa if type(i) == np.float64]
|
|
|
AIC_min = min(AIC)
|
|
|
for i in np.arange(2,len(sa),3):
|
|
|
if sa[i] == min(AIC):
|
|
|
param = sa[i-2]
|
|
|
param_seasonal = sa[i-1]
|
|
|
mod = sm.tsa.statespace.SARIMAX(train1,
|
|
|
order=(param),
|
|
|
seasonal_order=(param_seasonal),
|
|
|
enforce_stationarity=False,
|
|
|
enforce_invertibility=False)
|
|
|
print('模型最终定阶为:', (param, param_seasonal))
|
|
|
results = mod.fit()
|
|
|
print(results.summary().tables[1])
|
|
|
fig = plt.figure(figsize=(15, 12))
|
|
|
results.plot_diagnostics(figsize=(15, 12), fig=fig)
|
|
|
plt.show()
|
|
|
pre_10 = results.predict(start=378, end=387,dynamic=True)
|
|
|
out_pre = pd.DataFrame(np.zeros([10,3]),columns = ['real', 'pre', 'error'])
|
|
|
out_pre['real'] = list(test1['on_man'])
|
|
|
out_pre['pre'] = list(pre_10)
|
|
|
# 计算相对误差
|
|
|
error_seasonal = (out_pre.loc[:, 'pre']-out_pre.loc[:,'real'])/out_pre.loc[:,'real']
|
|
|
# 平均相对误差
|
|
|
error_mean = abs(error_seasonal).mean()
|
|
|
print('预测平均相对误差为:', error_mean)
|
|
|
|
|
|
|
|
|
|
|
|
# 代码9-11
|
|
|
# 节假日客流规律
|
|
|
# 由于之前步骤对on_h有修改,所以此处重新载入
|
|
|
holiday = pd.read_csv('../tmp/holiday.csv', index_col=0
|
|
|
,encoding = 'utf-8')
|
|
|
Train_Station = pd.read_csv('../tmp/Train_Station.csv', index_col=0
|
|
|
,encoding='utf-8')
|
|
|
Train_ST111_01 = Train_Station[Train_Station.iloc[:, 0] == 'ST111-01']
|
|
|
on_h = Train_ST111_01.groupby('date')['on_man'].sum()
|
|
|
on_h = pd.DataFrame(on_h)
|
|
|
on_h['date'] = 0
|
|
|
on_h['holiday'] = 0
|
|
|
# 添加日期和类型(工作日或者小长假)
|
|
|
for i in range(len(holiday)):
|
|
|
for j in range(len(on_h)):
|
|
|
if holiday.iloc[i,0] == on_h.index[j]:
|
|
|
on_h.loc[on_h.index[j], 'holiday'] = holiday.iloc[i,1]
|
|
|
on_h.loc[on_h.index[j], 'date'] = holiday.iloc[i,0]
|
|
|
# 2015春节
|
|
|
fig = plt.figure(figsize=(12, 6)) # 设置画布
|
|
|
ax = fig.add_subplot(1, 1, 1)
|
|
|
ax.plot(on_h.loc['2015-01-19':'2015-02-18', 'on_man'], color = 'blue')
|
|
|
ax.plot(on_h.loc['2015-02-18':'2015-02-25', 'on_man'], color = 'red', linestyle=':')
|
|
|
ax.plot(on_h.loc['2015-02-25':'2015-03-01', 'on_man'], color = 'blue')
|
|
|
plt.xlabel('日期')
|
|
|
plt.ylabel('上车人数')
|
|
|
plt.title('2015春节客流量')
|
|
|
plt.legend(['工作日','节假日'])
|
|
|
plt.xticks(rotation = 45)
|
|
|
plt.show()
|
|
|
|
|
|
# 2015劳动节
|
|
|
fig1 = plt.figure(figsize=(12, 6)) # 设置画布
|
|
|
ax1 = fig1.add_subplot(1, 1, 1)
|
|
|
ax1.plot(on_h.loc['2015-04-27':'2015-05-01', 'on_man'], color = 'blue')
|
|
|
ax1.plot(on_h.loc['2015-05-01':'2015-05-04', 'on_man'], color = 'red', linestyle=':')
|
|
|
ax1.plot(on_h.loc['2015-05-04':'2015-05-11', 'on_man'], color = 'blue')
|
|
|
plt.xlabel('日期')
|
|
|
plt.ylabel('上车人数')
|
|
|
plt.title('五一客流量')
|
|
|
plt.legend(['工作日','节假日'])
|
|
|
plt.xticks(rotation = 45)
|
|
|
plt.show()
|
|
|
|
|
|
# 国庆和中秋
|
|
|
fig2 = plt.figure(figsize=(12, 6)) # 设置画布
|
|
|
ax2 = fig2.add_subplot(1, 1, 1)
|
|
|
ax2.plot(on_h.loc['2015-09-21':'2015-09-26', 'on_man'],color = 'blue')
|
|
|
ax2.plot(on_h.loc['2015-09-26':'2015-09-28', 'on_man'],color = 'red', linestyle=':')
|
|
|
ax2.plot(on_h.loc['2015-09-28':'2015-09-30', 'on_man'],color = 'blue')
|
|
|
ax2.plot(on_h.loc['2015-09-30':'2015-10-08', 'on_man'],color = 'red', linestyle=':')
|
|
|
ax2.plot(on_h.loc['2015-10-08':'2015-10-12', 'on_man'],color = 'blue')
|
|
|
plt.xlabel('日期')
|
|
|
plt.ylabel('上车人数')
|
|
|
plt.title('中秋和国庆客流量')
|
|
|
plt.legend(['工作日', '节假日'])
|
|
|
plt.xticks(rotation = 45)
|
|
|
plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
# 代码9-12
|
|
|
# 2015和2016春节客流量比较
|
|
|
compare = pd.DataFrame(on_h.loc['2015-02-05':'2015-02-26', 'on_man'])
|
|
|
compare['2016'] = list(on_h.loc['2016-01-25':'2016-02-15', 'on_man'])
|
|
|
compare.columns = ['2015', '2016']
|
|
|
compare.index = range(len(compare))
|
|
|
plt.plot(compare.index, compare['2015'], linestyle=':')
|
|
|
plt.plot(compare.index, compare['2016'])
|
|
|
plt.legend(['2015', '2016'])
|
|
|
plt.xlabel('日期')
|
|
|
plt.ylabel('客流量')
|
|
|
plt.title('2015和2016春节客流量比较')
|
|
|
|
|
|
|
|
|
|
|
|
# 代码9-13
|
|
|
# 2015春节节假日波动系数
|
|
|
M = on_h.loc['2015-01-01':'2015-02-17']
|
|
|
M = M[M.loc[:,'holiday'] != '小长假']
|
|
|
M1 = on_h.loc['2015-01-01':'2015-02-07']
|
|
|
M1 = M1[M1.loc[:,'holiday'] != '小长假']
|
|
|
B_coef = []
|
|
|
# 春节前10天
|
|
|
for i in on_h.loc['2015-02-08':'2015-02-17',:].index:
|
|
|
B_coef.append('%.2f' % (on_h.loc[i, 'on_man']/math.ceil(M1.iloc[-30:].mean())))
|
|
|
# 春节及春节后两天
|
|
|
for i in on_h.loc['2015-02-18':'2015-02-26',:].index:
|
|
|
B_coef.append('%.2f' % (on_h.loc[i, 'on_man']/math.ceil(M.iloc[-30:].mean())))
|
|
|
B_coef = pd.DataFrame(B_coef)
|
|
|
B_coef.columns = ['on_man']
|
|
|
B_coef[u'on_man'] = B_coef[u'on_man'].astype(float)
|
|
|
fig3 = plt.figure(figsize=(8,6)) # 设置画布
|
|
|
ax3 = fig3.add_subplot(1, 1, 1)
|
|
|
ax3.plot(B_coef.iloc[:,0], color='blue')
|
|
|
plt.xlabel('Index')
|
|
|
plt.ylabel('系数')
|
|
|
plt.title('2015春节客流量波动系数')
|
|
|
# 设置数字标签
|
|
|
for a, b in zip(B_coef.index, B_coef.iloc[:, 0]):
|
|
|
plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
|
|
|
plt.legend()
|
|
|
plt.show()
|
|
|
|
|
|
|
|
|
# 预测2016年春节客流量
|
|
|
MM = on_h.loc['2015-01-01':'2016-02-06',:]
|
|
|
MM = MM[MM.loc[:, 'holiday'] != '小长假']
|
|
|
MM_mean = math.ceil(MM.iloc[-30:].mean())
|
|
|
MM1 = on_h.loc['2015-01-01':'2016-01-27',:]
|
|
|
MM1 = MM1[MM1.loc[:, 'holiday'] != '小长假']
|
|
|
MM1_mean = math.ceil(MM1.iloc[-30:].mean())
|
|
|
pre_2016_b = B_coef.iloc[0:10, 0] * MM1_mean
|
|
|
pre_2016_a = B_coef.iloc[10:, 0] * MM_mean
|
|
|
pre_2016 = pd.DataFrame(on_h.loc['2016-01-28':'2016-02-15', 'on_man'])
|
|
|
pre_2016['pre'] = 0
|
|
|
pre_2016.loc[0:10,'pre'] = list(pre_2016_b)
|
|
|
pre_2016.loc[10:,'pre'] = list(pre_2016_a)
|
|
|
pre_2016.columns=['real', 'pre']
|
|
|
plt.plot(pre_2016.index, pre_2016.real)
|
|
|
plt.plot(pre_2016.index, pre_2016.pre, linestyle=':')
|
|
|
plt.xticks(rotation=45)
|
|
|
plt.legend(['real', 'pre'])
|
|
|
plt.xlabel('日期')
|
|
|
plt.ylabel('客流量')
|
|
|
plt.title('预测2016年春节客流量')
|
|
|
|
|
|
# 计算相对误差
|
|
|
error_pre = (pre_2016.loc[:, 'pre'] - pre_2016.loc[:, 'real'])/pre_2016.loc[:, 'real']
|
|
|
# 平均相对误差
|
|
|
error_pre_mean = abs(error_pre).mean()
|
|
|
print('预测的平均相对误差为:', error_pre_mean)
|