|
|
|
|
# Code listing 9-1
|
|
|
|
|
import numpy as np
|
|
|
|
|
import os
|
|
|
|
|
from itertools import chain
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import re
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Root folder; each entry directly under it is expected to be a sub-folder
# containing the report files.
path = '../data/201501-201603'
filename = os.listdir(path)  # names of all entries under the root folder
n_filename = len(filename)

datalist = []  # one list of file paths per sub-folder
date = []  # number of files found in each sub-folder
for folder in filename:
    name = '../data/201501-201603/' + folder
    members = [name + '/' + entry for entry in os.listdir(name)]
    datalist.append(members)
    date.append(len(members))

# Flatten the per-folder lists into a single list of file paths.
filedata = [member for group in datalist for member in group]
n_file = len(filedata)

# Output file that receives the merged Station table.
SaveFile_Name = '../tmp/Station.csv'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Value written into boarding cells of rows that have no matching boarding
# column (a block may have more alighting rows than boarding columns).
_NO_BOARDING = 0.1

# Raw strings: the original non-raw '[0-9\—]+' contained an invalid escape
# sequence. The compiled patterns are unchanged.
_DATE_RE = re.compile(r'[0-9\—]+')
_TRAIN_RE = re.compile(r'[A-Z]{2}[0-9]{2} ')

_STATION_COLUMNS = ['on_station', 'on_man', 'on_time',
                    'off_man', 'off_time', 'date', 'train']


def _positions_of(values, text):
    """Return the positions in ``values`` whose item equals ``text``."""
    return [pos for pos, value in enumerate(values) if value == text]


def _fit(values, length):
    """Return ``values`` as a list of exactly ``length`` items.

    Missing trailing items are filled with ``_NO_BOARDING``; items beyond
    ``length`` are dropped.
    """
    fitted = list(values)[:length]
    fitted.extend([_NO_BOARDING] * (length - len(fitted)))
    return fitted


def _build_station_table(data):
    """Flatten one report sheet into a seven-column Station table.

    The sheet is scanned by the labels in its first column. Each '上车站'
    row starts a block and is paired, in order, with an '上车人数合计' row
    that ends it. Within a block:

    * the block rows are those from two rows below '上车站' up to (not
      including) the '上车人数合计' row;
    * the boarding columns run from column 2 up to (not including) the
      column whose cell in the '上车站' row equals '下车人数合计'.

    One output row is produced per block row.

    Raises:
        ValueError: the numbers of '上车站' and '上车人数合计' rows differ.
        IndexError: a block has no '下车人数合计' column, or the sheet has
            blocks but no '始发日期' header cell.
    """
    labels = data.iloc[:, 0]
    on_rows = _positions_of(labels, '上车站')
    total_rows = _positions_of(labels, '上车人数合计')
    if len(on_rows) != len(total_rows):
        raise ValueError(
            'found %d boarding-station rows but %d boarding-total rows'
            % (len(on_rows), len(total_rows)))

    # Collect plain lists and build the frame once at the end. Assigning
    # text into a pre-allocated float frame relies on implicit upcasting,
    # which pandas has deprecated.
    records = {column: [] for column in _STATION_COLUMNS}
    if not on_rows:
        return pd.DataFrame(records, columns=_STATION_COLUMNS)

    # The first departure-date header supplies the date for every block.
    # Non-text cells (e.g. NaN) are skipped instead of raising TypeError.
    headers = [cell for cell in labels
               if isinstance(cell, str) and '始发日期' in cell]
    if not headers:
        raise IndexError("no '始发日期' header found in the first column")
    date_matches = _DATE_RE.findall(headers[0])
    start_date = date_matches[0] if date_matches else None

    for on_row, total_row in zip(on_rows, total_rows):
        off_col = _positions_of(data.iloc[on_row, :], '下车人数合计')[0]
        block_rows = slice(on_row + 2, total_row)
        boarding_cols = slice(2, off_col)

        # The train code is searched for in the cell directly above the
        # '上车站' label.
        train_matches = _TRAIN_RE.findall(labels.iloc[on_row - 1])
        train = train_matches[0] if train_matches else None

        off_names = list(data.iloc[block_rows, 0])
        size = len(off_names)

        # NOTE: the 'on_station' column is filled with the labels of the
        # block rows (first column), exactly as the output format expects.
        records['on_station'].extend(off_names)
        records['off_man'].extend(data.iloc[block_rows, off_col])
        records['off_time'].extend(data.iloc[block_rows, 1])
        # Boarding totals come from the '上车人数合计' row and boarding times
        # from the row just below '上车站'; both are laid out position by
        # position against the block rows.
        records['on_man'].extend(_fit(data.iloc[total_row, boarding_cols], size))
        records['on_time'].extend(_fit(data.iloc[on_row + 1, boarding_cols], size))
        records['date'].extend([start_date] * size)
        records['train'].extend([train] * size)

    return pd.DataFrame(records, columns=_STATION_COLUMNS)


def Deal_Fun(b):
    """Append the Station rows of the first ``b`` workbooks to the output CSV.

    Each path ``filedata[0] .. filedata[b - 1]`` is read with
    ``pd.read_excel``, flattened by ``_build_station_table`` and appended to
    ``SaveFile_Name`` without header or index. The file is opened in append
    mode, so running the function again adds the same rows a second time.

    Parameters:
        b: number of workbooks to process, counted from the start of
           ``filedata``.

    Raises:
        IndexError: ``b`` exceeds ``len(filedata)`` (plus the errors listed
            for ``_build_station_table``).
    """
    for m in range(b):
        data = pd.read_excel(filedata[m])
        Station = _build_station_table(data)
        Station.to_csv(SaveFile_Name, encoding="utf_8",
                       index=False, header=False, mode='a+')
|
|
|
|
|
# Merge every discovered workbook into the Station CSV.
Deal_Fun(n_file)

# Code listing 9-2: clean the merged Station table.
# Every column is read as text (dtype=object). With type inference a column
# (or a chunk of it) made only of numbers is parsed numerically, so the
# textual '0.1' comparison below would miss its placeholders, and an
# all-digit date would reach re.findall as an integer.
Train_Station = pd.read_csv('../tmp/Station.csv',
                            header=None, encoding='utf-8', dtype=object)
Train_Station.columns = ['on_station', 'on_man', 'on_time',
                         'off_man', 'off_time', 'date', 'Station']
Train_Station.fillna(value=0, inplace=True)  # empty cells become 0

# Replace the '0.1' placeholder (written by Deal_Fun where a row has no
# boarding value) with 0, in one vectorised pass over the whole table.
Train_Station = Train_Station.mask(Train_Station == '0.1', 0)

# Rewrite each date as YYYY-MM-DD using the first run of digits in the cell.
first_runs = [re.findall('[0-9]+', raw)[0] for raw in Train_Station['date']]
s_date = [run[0:4] + '-' + run[4:6] + '-' + run[6:8] for run in first_runs]
Train_Station['date'] = s_date

# Some passenger counts are a single blank; replace those with 0.
ind_on = list(Train_Station.index[Train_Station['on_man'] == ' '])
ind_off = list(Train_Station.index[Train_Station['off_man'] == ' '])
Train_Station.loc[ind_on, 'on_man'] = 0
Train_Station.loc[ind_off, 'off_man'] = 0
Train_Station['on_man'] = Train_Station['on_man'].astype(float)
Train_Station['off_man'] = Train_Station['off_man'].astype(float)

# Persist the cleaned table, then reload it so dtypes are inferred from the
# cleaned file.
Train_Station.to_csv('../tmp/Train_Station.csv', encoding='utf-8')
Train_Station = pd.read_csv('../tmp/Train_Station.csv', index_col=0,
                            encoding='utf-8')
|