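# NOTE: the following lines are residue from the repository web viewer, not
# part of the program; they are kept as comments so the module can be parsed.
# You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 167 lines
# 7.3 KiB
#
# 2 weeks ago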
# 代码9-1
import numpy as np
import os
from itertools import chain
import pandas as pd
import re
import matplotlib.pyplot as plt
# Root folder; the code below lists one level of sub-folders inside it and
# then the entries inside each sub-folder.
path = '../data/201501-201603'

# os.listdir() returns entries in arbitrary order, so sort them to make the
# order of `filedata` (and therefore of the merged output rows) reproducible.
filename = sorted(os.listdir(path))
n_filename = len(filename)

datalist = []  # one list of entry paths per sub-folder
date = []      # number of entries found in each sub-folder
for folder in filename:
    # Built from `path` instead of repeating the literal, so changing the
    # root folder in one place is enough. '/' is used as separator exactly
    # as before, so the resulting strings are unchanged.
    name = path + '/' + folder
    workbooks = [name + '/' + entry for entry in sorted(os.listdir(name))]
    datalist.append(workbooks)
    date.append(len(workbooks))

# Flatten the per-folder lists into a single list of paths.
filedata = list(chain.from_iterable(datalist))
n_file = sum(date)  # total number of paths, equals len(filedata)

# Target CSV that receives the merged station table.
SaveFile_Name = '../tmp/Station.csv'
def Deal_Fun(b):
for m in range(b):
data = pd.read_excel(filedata[m])
row = data.shape[0]
Line =[] # 存放始发日期和上车站信息
Head_d = [] # 存放始发日期
for i in range(row):
if '始发日期' in data.iloc[i, 0]:
Head_d.append(re.findall('[0-9\—]+',data.iloc[i, 0]))
if data.iloc[i,0] == '上车站':
Line.append(re.findall('[A-Z]{2}[0-9]{2} ', data.iloc[i-1, 0]))
Line = pd.DataFrame(Line) # 以数据框形式存放上车站
Head_d = pd.DataFrame(Head_d)
Line['Head'] = 0 # 新加存放始发日期的列
for i in range(len(Line)):
Line.iloc[i,1] = Head_d.iloc[0, 0]
# 提取列车站点信息
# 提取上车站点位置信息
on_station = [i for i, x in enumerate(data.iloc[:, 0]) if x == '上车站']
# 提取上车人数合计位置
on_count = [i for i, x in enumerate(data.iloc[:, 0]) if x == '上车人数合计']
Size = pd.DataFrame(np.zeros([len(on_station), 2]),
columns=['on_station', 'on_count'])
Size['on_station'] = on_station
Size['on_count'] = on_count
Size['off_count'] = 0
for h in range(len(Size.iloc[:, 0])):
Size.loc[h,'off_count'] = [i for i, x in enumerate(
data.iloc[Size.iloc[h,0],:]) if x == '下车人数合计'][0]
# 提取上下车站点、人数和时间
# 下车站点
guodu = pd.DataFrame(data.iloc[:,0])
off_station = []
for j in range(len(Size.iloc[:,0])):
off_station.append(guodu.iloc[Size.iloc[j, 0]+2:Size.loc[j, 'on_count'], 0])
sum_station = 0
for i in range(len(off_station)):
sum_station = sum_station + len(off_station[i])
Out_off = pd.DataFrame(np.zeros([sum_station,3]),
columns = ['off_station', 'off_man', 'off_time'])
h = 0
for i in range(len(off_station)):
Out_off.iloc[h:h+len(off_station[i]), 0] = list(off_station[i])
h = h + len(off_station[i])
# 下车人数
off_man = []
for i in range(len(Size)):
data1 = pd.DataFrame(data.iloc[:, Size.loc[i, 'off_count']])
off_man.append(data1.iloc[Size.iloc[i,0]+2:Size.loc[i, 'on_count'], 0])
h = 0
for i in range(len(off_man)):
Out_off.iloc[h:h+len(off_man[i]),1] = list(off_man[i])
h = h + len(off_man[i])
# 下车时间
off_time = []
for i in range(len(Size)):
data1 = pd.DataFrame(data.iloc[:,1])
off_time.append(data1.iloc[Size.iloc[i,0]+2:Size.loc[i, 'on_count'], 0])
h = 0
for i in range(len(off_time)):
Out_off.iloc[h:h+len(off_time[i]), 2] = list(off_time[i])
h = h + len(off_time[i])
# 上车信息
Come = pd.DataFrame(np.zeros([sum_station, 3]),
columns = ['on_station', 'on_man', 'on_time'])
# 上车站点
on_station1 = []
for i in range(len(Size)):
data1 = pd.DataFrame(data.iloc[Size.loc[i,'on_station'], :])
on_station1.append(data1.iloc[2:Size.loc[i,'off_count'], 0])
h = 0
for i in range(len(on_station1)):
Come.iloc[h:h+len(on_station1[i]),0] = list(on_station1[i])
h = h + len(on_station1[i])
# 上车人数
on_man = []
for i in range(len(Size)):
data1 = pd.DataFrame(data.iloc[Size.loc[i,'on_count'],:])
on_man.append(data1.iloc[2:Size.loc[i,'off_count'], 0])
h = 0
for i in range(len(on_man)):
Come.iloc[h:h+len(on_man[i]), 1] = list(on_man[i])
h = h + len(on_man[i])
# 上车时间
on_time = []
for i in range(len(Size)):
data1 = pd.DataFrame(data.iloc[Size.loc[i,'on_station']+1, :])
on_time.append(data1.iloc[2:Size.loc[i,'off_count'], 0])
h = 0
for i in range(len(on_time)):
Come.iloc[h:h+len(on_time[i]),2] = list(on_time[i])
h = h + len(on_time[i])
Station = pd.DataFrame(np.zeros([len(Out_off),7]),
columns = ['on_station', 'on_man', 'on_time',
'off_man', 'off_time', 'date', 'train'])
Station['on_station'] = list(Out_off.iloc[:, 0])
Station['off_man'] = list(Out_off['off_man'])
Station['off_time'] = list(Out_off['off_time'])
k = 0
for i in range(len(on_man)):
Station.loc[k:k+len(on_man[i])-1, 'on_man'] = list(on_man[i])
Station.loc[k:k+len(on_time[i])-1, 'on_time'] = list(on_time[i])
Station.loc[k+len(on_time[i]):k-1+len(off_time[i]), 'on_time'] = 0.1
Station.loc[k+len(on_man[i]):k-1+len(off_man[i]), 'on_man'] = 0.1
Station.loc[k:k-1+len(off_man[i]), 'date'] = Line.iloc[i, 1]
Station.loc[k:k-1+len(off_man[i]), 'train'] = Line.iloc[i, 0]
k = k + len(off_man[i])
Station.to_csv(SaveFile_Name, encoding="utf_8",
index=False, header=False, mode='a+')
# Run the merge over every file that was discovered (end of listing 9-1).
Deal_Fun(n_file)

# Listing 9-2: clean the merged station table.
Train_Station = pd.read_csv('../tmp/Station.csv',
                            header=None, encoding='utf-8')
# The last column is named 'Station' here although it holds the value the
# merge step stores under 'train'; the name is kept for later code.
Train_Station.columns = ['on_station', 'on_man', 'on_time',
                         'off_man', 'off_time', 'date', 'Station']
Train_Station = Train_Station.fillna(value=0)  # missing cells become 0

# The merge step writes 0.1 where a row has no boarding value. Depending on
# what else a column contains, read_csv returns that filler as the string
# '0.1' or as the float 0.1, so both forms are reset to 0 here (the original
# cell-by-cell loop only caught the string form).
is_placeholder = (Train_Station == '0.1') | (Train_Station == 0.1)
Train_Station = Train_Station.mask(is_placeholder, 0)

# Rewrite the date column as YYYY-MM-DD using the first run of digits.
# str() guards against values that are not text (e.g. the 0 from fillna).
s_date = []
for raw_date in Train_Station['date']:
    digits = re.findall('[0-9]+', str(raw_date))[0]
    s_date.append(digits[0:4] + '-' + digits[4:6] + '-' + digits[6:8])
Train_Station['date'] = s_date

# Some passenger counts are a single blank; replace those with 0 so the
# columns can be converted to float.
ind_on = Train_Station.index[Train_Station['on_man'] == ' '].tolist()
ind_off = Train_Station.index[Train_Station['off_man'] == ' '].tolist()
Train_Station.loc[ind_on, 'on_man'] = 0
Train_Station.loc[ind_off, 'off_man'] = 0
Train_Station['on_man'] = Train_Station['on_man'].astype(float)
Train_Station['off_man'] = Train_Station['off_man'].astype(float)

# Persist the cleaned table and load it back from disk.
Train_Station.to_csv('../tmp/Train_Station.csv', encoding='utf-8')
Train_Station = pd.read_csv('../tmp/Train_Station.csv', index_col=0,
                            encoding='utf-8')