You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

167 lines
7.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# 代码9-1
import numpy as np
import os
from itertools import chain
import pandas as pd
import re
import matplotlib.pyplot as plt
path = '../data/201501-201603'  # root data folder
# Sub-folders of `path`, in sorted order. os.listdir() returns entries in
# arbitrary order, so sorting keeps the order of the merged output stable
# between runs; non-directory entries are skipped because they cannot be
# listed below.
filename = sorted(entry for entry in os.listdir(path)
                  if os.path.isdir(path + '/' + entry))
n_filename = len(filename)
datalist = []  # datalist[i]: paths of the files found inside filename[i]
date = []      # date[i]: number of files found inside filename[i]
for i in range(n_filename):
    name = path + '/' + filename[i]
    datalist.append([name + '/' + j for j in sorted(os.listdir(name))])
    date.append(len(datalist[i]))
filedata = list(chain.from_iterable(datalist))  # flatten the nested list into one list of paths
n_file = sum(date)
# File that receives the merged Station table.
# NOTE: Deal_Fun opens this file in append mode, so output left over from a
# previous run is extended rather than replaced.
SaveFile_Name = '../tmp/Station.csv'
def _fill_column(frame, col, chunks):
    """Write each chunk, one below the other, into column position `col` of `frame`.

    The first chunk lands on rows [0, len(chunk)), the next chunk starts on
    the row right after it, and so on.
    """
    start = 0
    for chunk in chunks:
        stop = start + len(chunk)
        frame.iloc[start:stop, col] = list(chunk)
        start = stop


def Deal_Fun(b):
    """Flatten the first `b` workbooks of `filedata` and append them to `SaveFile_Name`.

    For every workbook the function:

    1. scans column 0 for the cell containing '始发日期' (departure date) and
       for every '上车站' (boarding station) header cell; the cell directly
       above each header is searched for the train code;
    2. locates, per table, the header row, the '上车人数合计' (boarding total)
       row and the '下车人数合计' (alighting total) column;
    3. collects the station / count / time slices of every table;
    4. writes one seven-column frame (on_station, on_man, on_time, off_man,
       off_time, date, train) to `SaveFile_Name` in append mode, without
       header or index.

    Parameters
    ----------
    b : int
        Number of entries of the module-level `filedata` list to process.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # Raw string: '\—' is not a valid Python string escape, so the non-raw
    # literal triggers an invalid-escape warning. The pattern text is unchanged.
    date_pattern = re.compile(r'[0-9\—]+')
    train_pattern = re.compile('[A-Z]{2}[0-9]{2} ')

    for m in range(b):
        data = pd.read_excel(filedata[m])
        row = data.shape[0]

        Line = []    # train code found above each boarding-station header
        Head_d = []  # departure date strings
        for i in range(row):
            cell = data.iloc[i, 0]
            # Non-string cells (e.g. NaN) cannot contain the date marker;
            # testing them with `in` would raise TypeError.
            if isinstance(cell, str) and '始发日期' in cell:
                Head_d.append(date_pattern.findall(cell))
            if cell == '上车站':
                Line.append(train_pattern.findall(data.iloc[i - 1, 0]))
        Line = pd.DataFrame(Line)      # one row per table
        Head_d = pd.DataFrame(Head_d)
        Line['Head'] = 0               # extra column that receives the departure date
        for i in range(len(Line)):
            # Every table of the workbook gets the first date that was found.
            Line.iloc[i, 1] = Head_d.iloc[0, 0]

        # Row index of every boarding-station header and of every
        # boarding-total row in column 0.
        on_station = [i for i, x in enumerate(data.iloc[:, 0]) if x == '上车站']
        on_count = [i for i, x in enumerate(data.iloc[:, 0]) if x == '上车人数合计']
        Size = pd.DataFrame(np.zeros([len(on_station), 2]),
                            columns=['on_station', 'on_count'])
        Size['on_station'] = on_station
        Size['on_count'] = on_count
        Size['off_count'] = 0
        for h in range(len(Size.iloc[:, 0])):
            # Column position of the alighting-total header inside the header row.
            Size.loc[h, 'off_count'] = [i for i, x in enumerate(
                data.iloc[Size.iloc[h, 0], :]) if x == '下车人数合计'][0]

        # Per-table slices. The table body spans the rows between
        # (header row + 2) and the boarding-total row; the per-station
        # columns span positions 2 .. alighting-total column.
        off_station, off_man, off_time = [], [], []
        on_man, on_time = [], []
        for t in range(len(Size)):
            header_row = Size.loc[t, 'on_station']
            total_row = Size.loc[t, 'on_count']
            total_col = Size.loc[t, 'off_count']
            body = slice(header_row + 2, total_row)
            off_station.append(data.iloc[body, 0])
            off_man.append(data.iloc[body, total_col])
            off_time.append(data.iloc[body, 1])
            # Take the whole row first and slice afterwards, so the values
            # keep the types they have in the full row.
            on_man.append(data.iloc[total_row, :].iloc[2:total_col])
            on_time.append(data.iloc[header_row + 1, :].iloc[2:total_col])

        sum_station = sum(len(chunk) for chunk in off_station)

        # NOTE(review): the frames below are pre-allocated as float zeros and
        # then receive values of other types; recent pandas versions warn
        # about this kind of implicit upcast -- confirm against the pandas
        # version in use before changing the allocation.
        Out_off = pd.DataFrame(np.zeros([sum_station, 3]),
                               columns=['off_station', 'off_man', 'off_time'])
        _fill_column(Out_off, 0, off_station)
        _fill_column(Out_off, 1, off_man)
        _fill_column(Out_off, 2, off_time)

        Station = pd.DataFrame(np.zeros([len(Out_off), 7]),
                               columns=['on_station', 'on_man', 'on_time',
                                        'off_man', 'off_time', 'date', 'train'])
        # NOTE(review): the 'on_station' column is filled from the
        # 'off_station' column of Out_off -- confirm this naming is intended.
        Station['on_station'] = list(Out_off.iloc[:, 0])
        Station['off_man'] = list(Out_off['off_man'])
        Station['off_time'] = list(Out_off['off_time'])
        k = 0  # first Station row of the current table
        for i in range(len(on_man)):
            n_on = len(on_man[i])    # same length as on_time[i]
            n_off = len(off_man[i])  # same length as off_time[i]
            # .loc slices include their end label, hence the "- 1".
            Station.loc[k:k + n_on - 1, 'on_man'] = list(on_man[i])
            Station.loc[k:k + n_on - 1, 'on_time'] = list(on_time[i])
            # Rows of this table without a boarding value get the 0.1 placeholder.
            Station.loc[k + n_on:k - 1 + n_off, 'on_time'] = 0.1
            Station.loc[k + n_on:k - 1 + n_off, 'on_man'] = 0.1
            Station.loc[k:k - 1 + n_off, 'date'] = Line.iloc[i, 1]
            Station.loc[k:k - 1 + n_off, 'train'] = Line.iloc[i, 0]
            k = k + n_off
        # Append mode: each workbook adds its rows below the previous ones.
        Station.to_csv(SaveFile_Name, encoding="utf_8",
                       index=False, header=False, mode='a+')
Deal_Fun(n_file)

# Code 9-2
Train_Station = pd.read_csv('../tmp/Station.csv',
                            header=None, encoding='utf-8')
Train_Station.columns = ['on_station', 'on_man', 'on_time',
                         'off_man', 'off_time', 'date', 'Station']
Train_Station.fillna(value=0, inplace=True)  # replace NaN values with 0

# Replace the text '0.1' with 0 in every column, one column-wide comparison
# per column instead of one lookup per cell.
# NOTE(review): only the string '0.1' is matched; a cell that read_csv parsed
# as the number 0.1 is left untouched -- confirm whether that is intended.
for col in Train_Station.columns:
    is_placeholder = Train_Station[col] == '0.1'
    if is_placeholder.any():
        Train_Station.loc[is_placeholder, col] = 0

# Rebuild the date as YYYY-MM-DD from the first run of digits in each value;
# the regex is evaluated once per row.
digit_run = re.compile('[0-9]+')
s_date = []
for raw_date in Train_Station.loc[:, 'date']:
    digits = digit_run.findall(raw_date)[0]
    s_date.append(digits[0:4] + '-' + digits[4:6] + '-' + digits[6:8])
Train_Station.loc[:, 'date'] = s_date

# Some count cells hold a single blank; replace them with 0.
ind_on = list(Train_Station.index[Train_Station['on_man'] == ' '])
ind_off = list(Train_Station.index[Train_Station['off_man'] == ' '])
Train_Station.loc[ind_on, 'on_man'] = 0
Train_Station.loc[ind_off, 'off_man'] = 0
Train_Station['on_man'] = Train_Station['on_man'].astype(float)
Train_Station['off_man'] = Train_Station['off_man'].astype(float)

# Save the cleaned table and load it back from disk.
Train_Station.to_csv('../tmp/Train_Station.csv', encoding = 'utf-8')
Train_Station = pd.read_csv('../tmp/Train_Station.csv', index_col=0,
                            encoding = 'utf-8')