You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

694 lines
30 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "8f112140-b034-4878-aa19-9620c4add793",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"import os\n",
"from pathlib import Path\n",
"from itertools import chain"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7a1cf5c7-d90c-4003-a5ca-28e539e94f9c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"找到 438 个文件\n",
"处理文件 1/438: 20150101.xls\n",
"处理文件 2/438: 20150102.xls\n",
"处理文件 3/438: 20150103.xls\n",
"处理文件 4/438: 20150104.xls\n",
"处理文件 5/438: 20150105.xls\n",
"处理文件 6/438: 20150106.xls\n",
"处理文件 7/438: 20150107.xls\n",
"处理文件 8/438: 20150108.xls\n",
"处理文件 9/438: 20150109.xls\n",
"处理文件 10/438: 20150110.xls\n",
"处理文件 11/438: 20150111.xls\n",
"处理文件 12/438: 20150112.xls\n",
"处理文件 13/438: 20150113.xls\n",
"处理文件 14/438: 20150114.xls\n",
"处理文件 15/438: 20150115.xls\n",
"处理文件 16/438: 20150116.xls\n",
"处理文件 17/438: 20150117.xls\n",
"处理文件 18/438: 20150118.xls\n",
"处理文件 19/438: 20150119.xls\n",
"处理文件 20/438: 20150120.xls\n",
"处理文件 21/438: 20150121.xls\n",
"处理文件 22/438: 20150122.xls\n",
"处理文件 23/438: 20150123.xls\n",
"处理文件 24/438: 20150124.xls\n",
"处理文件 25/438: 20150125.xls\n",
"处理文件 26/438: 20150126.xls\n",
"处理文件 27/438: 20150127.xls\n",
"处理文件 28/438: 20150128.xls\n",
"处理文件 29/438: 20150129.xls\n",
"处理文件 30/438: 20150130.xls\n",
"处理文件 31/438: 20150131.xls\n",
"处理文件 32/438: 20150201.xls\n",
"处理文件 33/438: 20150202.xls\n",
"处理文件 34/438: 20150203.xls\n",
"处理文件 35/438: 20150204.xls\n",
"处理文件 36/438: 20150205.xls\n",
"处理文件 37/438: 20150206.xls\n",
"处理文件 38/438: 20150207.xls\n",
"处理文件 39/438: 20150208.xls\n",
"处理文件 40/438: 20150209.xls\n",
"处理文件 41/438: 20150210.xls\n",
"处理文件 42/438: 20150211.xls\n",
"处理文件 43/438: 20150212.xls\n",
"处理文件 44/438: 20150213.xls\n",
"处理文件 45/438: 20150214.xls\n",
"处理文件 46/438: 20150215.xls\n",
"处理文件 47/438: 20150216.xls\n",
"处理文件 48/438: 20150217.xls\n",
"处理文件 49/438: 20150218.xls\n",
"处理文件 50/438: 20150219.xls\n",
"处理文件 51/438: 20150220.xls\n",
"处理文件 52/438: 20150221.xls\n",
"处理文件 53/438: 20150222.xls\n",
"处理文件 54/438: 20150223.xls\n",
"处理文件 55/438: 20150224.xls\n",
"处理文件 56/438: 20150225.xls\n",
"处理文件 57/438: 20150226.xls\n",
"处理文件 58/438: 20150227.xls\n",
"处理文件 59/438: 20150228.xls\n",
"处理文件 60/438: 20150301.xls\n",
"处理文件 61/438: 20150302.xls\n",
"处理文件 62/438: 20150303.xls\n",
"处理文件 63/438: 20150304.xls\n",
"处理文件 64/438: 20150305.xls\n",
"处理文件 65/438: 20150306.xls\n",
"处理文件 66/438: 20150307.xls\n",
"处理文件 67/438: 20150308.xls\n",
"处理文件 68/438: 20150309.xls\n",
"处理文件 69/438: 20150310.xls\n",
"处理文件 70/438: 20150311.xls\n",
"处理文件 71/438: 20150312.xls\n",
"处理文件 72/438: 20150313.xls\n",
"处理文件 73/438: 20150314.xls\n",
"处理文件 74/438: 20150315.xls\n",
"处理文件 75/438: 20150316.xls\n",
"处理文件 76/438: 20150317.xls\n",
"处理文件 77/438: 20150318.xls\n",
"处理文件 78/438: 20150319.xls\n",
"处理文件 79/438: 20150320.xls\n",
"处理文件 80/438: 20150321.xls\n",
"处理文件 81/438: 20150322.xls\n",
"处理文件 82/438: 20150323.xls\n",
"处理文件 83/438: 20150325.xls\n",
"处理文件 84/438: 20150326.xls\n",
"处理文件 85/438: 20150327.xls\n",
"处理文件 86/438: 20150328.xls\n",
"处理文件 87/438: 20150329.xls\n",
"处理文件 88/438: 20150401.xls\n",
"处理文件 89/438: 20150402.xls\n",
"处理文件 90/438: 20150403.xls\n",
"处理文件 91/438: 20150404.xls\n",
"处理文件 92/438: 20150405.xls\n",
"处理文件 93/438: 20150406.xls\n",
"处理文件 94/438: 20150407.xls\n",
"处理文件 95/438: 20150408.xls\n",
"处理文件 96/438: 20150409.xls\n",
"处理文件 97/438: 20150410.xls\n",
"处理文件 98/438: 20150411.xls\n",
"处理文件 99/438: 20150412.xls\n",
"处理文件 100/438: 20150413.xls\n",
"处理文件 101/438: 20150414.xls\n",
"处理文件 102/438: 20150415.xls\n",
"处理文件 103/438: 20150416.xls\n",
"处理文件 104/438: 20150417.xls\n",
"处理文件 105/438: 20150418.xls\n",
"处理文件 106/438: 20150419.xls\n",
"处理文件 107/438: 20150420.xls\n",
"处理文件 108/438: 20150421.xls\n",
"处理文件 109/438: 20150422.xls\n",
"处理文件 110/438: 20150423.xls\n",
"处理文件 111/438: 20150424.xls\n",
"处理文件 112/438: 20150425.xls\n",
"处理文件 113/438: 20150426.xls\n",
"处理文件 114/438: 20150427.xls\n",
"处理文件 115/438: 20150429.xls\n",
"处理文件 116/438: 20150430.xls\n",
"处理文件 117/438: 20150431.xls\n",
"处理文件 118/438: 20150501.xls\n",
"处理文件 119/438: 20150502.xls\n",
"处理文件 120/438: 20150503.xls\n",
"处理文件 121/438: 20150504.xls\n",
"处理文件 122/438: 20150505.xls\n",
"处理文件 123/438: 20150506.xls\n",
"处理文件 124/438: 20150507.xls\n",
"处理文件 125/438: 20150508.xls\n",
"处理文件 126/438: 20150509.xls\n",
"处理文件 127/438: 20150510.xls\n",
"处理文件 128/438: 20150511.xls\n",
"处理文件 129/438: 20150512.xls\n",
"处理文件 130/438: 20150513.xls\n",
"处理文件 131/438: 20150514.xls\n",
"处理文件 132/438: 20150515.xls\n",
"处理文件 133/438: 20150516.xls\n",
"处理文件 134/438: 20150517.xls\n",
"处理文件 135/438: 20150518.xls\n",
"处理文件 136/438: 20150519.xls\n",
"处理文件 137/438: 20150520.xls\n",
"处理文件 138/438: 20150521.xls\n",
"处理文件 139/438: 20150522.xls\n",
"处理文件 140/438: 20150523.xls\n",
"处理文件 141/438: 20150524.xls\n",
"处理文件 142/438: 20150525.xls\n",
"处理文件 143/438: 20150526.xls\n",
"处理文件 144/438: 20150527.xls\n",
"处理文件 145/438: 20150528.xls\n",
"处理文件 146/438: 20150529.xls\n",
"处理文件 147/438: 20150530.xls\n",
"处理文件 148/438: 20150531.xls\n",
"处理文件 149/438: 20150601.xls\n",
"处理文件 150/438: 20150602.xls\n",
"处理文件 151/438: 20150603.xls\n",
"处理文件 152/438: 20150604.xls\n",
"处理文件 153/438: 20150605.xls\n",
"处理文件 154/438: 20150606.xls\n",
"处理文件 155/438: 20150607.xls\n",
"处理文件 156/438: 20150608.xls\n",
"处理文件 157/438: 20150609.xls\n",
"处理文件 158/438: 20150610.xls\n",
"处理文件 159/438: 20150611.xls\n",
"处理文件 160/438: 20150612.xls\n",
"处理文件 161/438: 20150613.xls\n",
"处理文件 162/438: 20150614.xls\n",
"处理文件 163/438: 20150615.xls\n",
"处理文件 164/438: 20150616.xls\n",
"处理文件 165/438: 20150617.xls\n",
"处理文件 166/438: 20150618.xls\n",
"处理文件 167/438: 20150619.xls\n",
"处理文件 168/438: 20150620.xls\n",
"处理文件 169/438: 20150621.xls\n",
"处理文件 170/438: 20150622.xls\n",
"处理文件 171/438: 20150623.xls\n",
"处理文件 172/438: 20150624.xls\n",
"处理文件 173/438: 20150625.xls\n",
"处理文件 174/438: 20150626.xls\n",
"处理文件 175/438: 20150627.xls\n",
"处理文件 176/438: 20150628.xls\n",
"处理文件 177/438: 20150629.xls\n",
"处理文件 178/438: 20150630.xls\n",
"处理文件 179/438: 20150701.xls\n",
"处理文件 180/438: 20150702.xls\n",
"处理文件 181/438: 20150703.xls\n",
"处理文件 182/438: 20150704.xls\n",
"处理文件 183/438: 20150705.xls\n",
"处理文件 184/438: 20150706.xls\n",
"处理文件 185/438: 20150707.xls\n",
"处理文件 186/438: 20150708.xls\n",
"处理文件 187/438: 20150709.xls\n",
"处理文件 188/438: 20150710.xls\n",
"处理文件 189/438: 20150711.xls\n",
"处理文件 190/438: 20150712.xls\n",
"处理文件 191/438: 20150713.xls\n",
"处理文件 192/438: 20150714.xls\n",
"处理文件 193/438: 20150715.xls\n",
"处理文件 194/438: 20150716.xls\n",
"处理文件 195/438: 20150717.xls\n",
"处理文件 196/438: 20150718.xls\n",
"处理文件 197/438: 20150719.xls\n",
"处理文件 198/438: 20150720.xls\n",
"处理文件 199/438: 20150721.xls\n",
"处理文件 200/438: 20150722.xls\n",
"处理文件 201/438: 20150723.xls\n",
"处理文件 202/438: 20150724.xls\n",
"处理文件 203/438: 20150725.xls\n",
"处理文件 204/438: 20150727.xls\n",
"处理文件 205/438: 20150728.xls\n",
"处理文件 206/438: 20150729.xls\n",
"处理文件 207/438: 20150730.xls\n",
"处理文件 208/438: 20150731.xls\n",
"处理文件 209/438: 20150801.xls\n",
"处理文件 210/438: 20150802.xls\n",
"处理文件 211/438: 20150803.xls\n",
"处理文件 212/438: 20150804.xls\n",
"处理文件 213/438: 20150805.xls\n",
"处理文件 214/438: 20150806.xls\n",
"处理文件 215/438: 20150807.xls\n",
"处理文件 216/438: 20150808.xls\n",
"处理文件 217/438: 20150809.xls\n",
"处理文件 218/438: 20150810.xls\n",
"处理文件 219/438: 20150811.xls\n",
"处理文件 220/438: 20150812.xls\n",
"处理文件 221/438: 20150813.xls\n",
"处理文件 222/438: 20150814.xls\n",
"处理文件 223/438: 20150815.xls\n",
"处理文件 224/438: 20150816.xls\n",
"处理文件 225/438: 20150817.xls\n",
"处理文件 226/438: 20150818.xls\n",
"处理文件 227/438: 20150819.xls\n",
"处理文件 228/438: 20150820.xls\n",
"处理文件 229/438: 20150821.xls\n",
"处理文件 230/438: 20150822.xls\n",
"处理文件 231/438: 20150823.xls\n",
"处理文件 232/438: 20150824.xls\n",
"处理文件 233/438: 20150825.xls\n",
"处理文件 234/438: 20150827.xls\n",
"处理文件 235/438: 20150828.xls\n",
"处理文件 236/438: 20150829.xls\n",
"处理文件 237/438: 20150830.xls\n",
"处理文件 238/438: 20150831.xls\n",
"处理文件 239/438: 20150901.xls\n",
"处理文件 240/438: 20150902.xls\n",
"处理文件 241/438: 20150903.xls\n",
"处理文件 242/438: 20150904.xls\n",
"处理文件 243/438: 20150905.xls\n",
"处理文件 244/438: 20150906.xls\n",
"处理文件 245/438: 20150907.xls\n",
"处理文件 246/438: 20150908.xls\n",
"处理文件 247/438: 20150909.xls\n",
"处理文件 248/438: 20150910.xls\n",
"处理文件 249/438: 20150911.xls\n",
"处理文件 250/438: 20150912.xls\n",
"处理文件 251/438: 20150913.xls\n",
"处理文件 252/438: 20150914.xls\n",
"处理文件 253/438: 20150915.xls\n",
"处理文件 254/438: 20150916.xls\n",
"处理文件 255/438: 20150917.xls\n",
"处理文件 256/438: 20150918.xls\n",
"处理文件 257/438: 20150919.xls\n",
"处理文件 258/438: 20150920.xls\n",
"处理文件 259/438: 20150921.xls\n",
"处理文件 260/438: 20150922.xls\n",
"处理文件 261/438: 20150923.xls\n",
"处理文件 262/438: 20150924.xls\n",
"处理文件 263/438: 20150925.xls\n",
"处理文件 264/438: 20150926.xls\n",
"处理文件 265/438: 20150927.xls\n",
"处理文件 266/438: 20150928.xls\n",
"处理文件 267/438: 20150929.xls\n",
"处理文件 268/438: 20150930.xls\n",
"处理文件 269/438: 20151001.xls\n",
"处理文件 270/438: 20151002.xls\n",
"处理文件 271/438: 20151003.xls\n",
"处理文件 272/438: 20151004.xls\n",
"处理文件 273/438: 20151005.xls\n",
"处理文件 274/438: 20151006.xls\n",
"处理文件 275/438: 20151007.xls\n",
"处理文件 276/438: 20151008.xls\n",
"处理文件 277/438: 20151009.xls\n",
"处理文件 278/438: 20151010.xls\n",
"处理文件 279/438: 20151011.xls\n",
"处理文件 280/438: 20151012.xls\n",
"处理文件 281/438: 20151013.xls\n",
"处理文件 282/438: 20151014.xls\n",
"处理文件 283/438: 20151015.xls\n",
"处理文件 284/438: 20151016.xls\n",
"处理文件 285/438: 20151017.xls\n",
"处理文件 286/438: 20151018.xls\n",
"处理文件 287/438: 20151019.xls\n",
"处理文件 288/438: 20151020.xls\n",
"处理文件 289/438: 20151021.xls\n",
"处理文件 290/438: 20151022.xls\n",
"处理文件 291/438: 20151023.xls\n",
"处理文件 292/438: 20151024.xls\n",
"处理文件 293/438: 20151025.xls\n",
"处理文件 294/438: 20151026.xls\n",
"处理文件 295/438: 20151027.xls\n",
"处理文件 296/438: 20151028.xls\n",
"处理文件 297/438: 20151029.xls\n",
"处理文件 298/438: 20151030.xls\n",
"处理文件 299/438: 20151101.xls\n",
"处理文件 300/438: 20151102.xls\n",
"处理文件 301/438: 20151103.xls\n",
"处理文件 302/438: 20151104.xls\n",
"处理文件 303/438: 20151105.xls\n",
"处理文件 304/438: 20151106.xls\n",
"处理文件 305/438: 20151107.xls\n",
"处理文件 306/438: 20151108.xls\n",
"处理文件 307/438: 20151109.xls\n",
"处理文件 308/438: 20151110.xls\n",
"处理文件 309/438: 20151111.xls\n",
"处理文件 310/438: 20151112.xls\n",
"处理文件 311/438: 20151113.xls\n",
"处理文件 312/438: 20151114.xls\n",
"处理文件 313/438: 20151115.xls\n",
"处理文件 314/438: 20151116.xls\n",
"处理文件 315/438: 20151117.xls\n",
"处理文件 316/438: 20151118.xls\n",
"处理文件 317/438: 20151119.xls\n",
"处理文件 318/438: 20151120.xls\n",
"处理文件 319/438: 20151121.xls\n",
"处理文件 320/438: 20151122.xls\n",
"处理文件 321/438: 20151123.xls\n",
"处理文件 322/438: 20151124.xls\n",
"处理文件 323/438: 20151125.xls\n",
"处理文件 324/438: 20151126.xls\n",
"处理文件 325/438: 20151127.xls\n",
"处理文件 326/438: 20151128.xls\n",
"处理文件 327/438: 20151129.xls\n",
"处理文件 328/438: 20151130.xls\n",
"处理文件 329/438: 20151201.xls\n",
"处理文件 330/438: 20151202.xls\n",
"处理文件 331/438: 20151203.xls\n",
"处理文件 332/438: 20151204.xls\n",
"处理文件 333/438: 20151205.xls\n",
"处理文件 334/438: 20151206.xls\n",
"处理文件 335/438: 20151207.xls\n",
"处理文件 336/438: 20151208.xls\n",
"处理文件 337/438: 20151209.xls\n",
"处理文件 338/438: 20151210.xls\n",
"处理文件 339/438: 20151211.xls\n",
"处理文件 340/438: 20151212.xls\n",
"处理文件 341/438: 20151213.xls\n",
"处理文件 342/438: 20151214.xls\n",
"处理文件 343/438: 20151215.xls\n",
"处理文件 344/438: 20151216.xls\n",
"处理文件 345/438: 20151217.xls\n",
"处理文件 346/438: 20151218.xls\n",
"处理文件 347/438: 20151219.xls\n",
"处理文件 348/438: 20151220.xls\n",
"处理文件 349/438: 20151221.xls\n",
"处理文件 350/438: 20151222.xls\n",
"处理文件 351/438: 20151223.xls\n",
"处理文件 352/438: 20151224.xls\n",
"处理文件 353/438: 20151225.xls\n",
"处理文件 354/438: 20151226.xls\n",
"处理文件 355/438: 20151227.xls\n",
"处理文件 356/438: 20151228.xls\n",
"处理文件 357/438: 20151229.xls\n",
"处理文件 358/438: 20151230.xls\n",
"处理文件 359/438: 20151231.xls\n",
"处理文件 360/438: 20160101.xls\n",
"处理文件 361/438: 20160102.xls\n",
"处理文件 362/438: 20160103.xls\n",
"处理文件 363/438: 20160104.xls\n",
"处理文件 364/438: 20160105.xls\n",
"处理文件 365/438: 20160106.xls\n",
"处理文件 366/438: 20160107.xls\n",
"处理文件 367/438: 20160108.xls\n",
"处理文件 368/438: 20160109.xls\n",
"处理文件 369/438: 20160110.xls\n",
"处理文件 370/438: 20160111.xls\n",
"处理文件 371/438: 20160112.xls\n",
"处理文件 372/438: 20160113.xls\n",
"处理文件 373/438: 20160114.xls\n",
"处理文件 374/438: 20160115.xls\n",
"处理文件 375/438: 20160116.xls\n",
"处理文件 376/438: 20160117.xls\n",
"处理文件 377/438: 20160118.xls\n",
"处理文件 378/438: 20160119.xls\n",
"处理文件 379/438: 20160120.xls\n",
"处理文件 380/438: 20160121.xls\n",
"处理文件 381/438: 20160122.xls\n",
"处理文件 382/438: 20160123.xls\n",
"处理文件 383/438: 20160124.xls\n",
"处理文件 384/438: 20160125.xls\n",
"处理文件 385/438: 20160126.xls\n",
"处理文件 386/438: 20160127.xls\n",
"处理文件 387/438: 20160128.xls\n",
"处理文件 388/438: 20160129.xls\n",
"处理文件 389/438: 20160130.xls\n",
"处理文件 390/438: 20160131.xls\n",
"处理文件 391/438: 20160201.xls\n",
"处理文件 392/438: 20160202.xls\n",
"处理文件 393/438: 20160203.xls\n",
"处理文件 394/438: 20160204.xls\n",
"处理文件 395/438: 20160205.xls\n",
"处理文件 396/438: 20160206.xls\n",
"处理文件 397/438: 20160207.xls\n",
"处理文件 398/438: 20160208.xls\n",
"处理文件 399/438: 20160209.xls\n",
"处理文件 400/438: 20160210.xls\n",
"处理文件 401/438: 20160211.xls\n",
"处理文件 402/438: 20160212.xls\n",
"处理文件 403/438: 20160213.xls\n",
"处理文件 404/438: 20160214.xls\n",
"处理文件 405/438: 20160215.xls\n",
"处理文件 406/438: 20160216.xls\n",
"处理文件 407/438: 20160217.xls\n",
"处理文件 408/438: 20160218.xls\n",
"处理文件 409/438: 20160220.xls\n",
"处理文件 410/438: 20160221.xls\n",
"处理文件 411/438: 20160222.xls\n",
"处理文件 412/438: 20160223.xls\n",
"处理文件 413/438: 20160224.xls\n",
"处理文件 414/438: 20160225.xls\n",
"处理文件 415/438: 20160226.xls\n",
"处理文件 416/438: 20160227.xls\n",
"处理文件 417/438: 20160228.xls\n",
"处理文件 418/438: 20160229.xls\n",
"处理文件 419/438: 20160301.xls\n",
"处理文件 420/438: 20160302.xls\n",
"处理文件 421/438: 20160303.xls\n",
"处理文件 422/438: 20160304.xls\n",
"处理文件 423/438: 20160305.xls\n",
"处理文件 424/438: 20160306.xls\n",
"处理文件 425/438: 20160307.xls\n",
"处理文件 426/438: 20160308.xls\n",
"处理文件 427/438: 20160309.xls\n",
"处理文件 428/438: 20160310.xls\n",
"处理文件 429/438: 20160311.xls\n",
"处理文件 430/438: 20160312.xls\n",
"处理文件 431/438: 20160313.xls\n",
"处理文件 432/438: 20160314.xls\n",
"处理文件 433/438: 20160315.xls\n",
"处理文件 434/438: 20160316.xls\n",
"处理文件 435/438: 20160317.xls\n",
"处理文件 436/438: 20160318.xls\n",
"处理文件 437/438: 20160319.xls\n",
"处理文件 438/438: 20160320.xls\n",
"\n",
"数据提取完成,保存至: ../tmp/Station.csv\n"
]
}
],
"source": [
"\n",
"# ==================== Listing 9-1: data extraction and merging ====================\n",
"\n",
"# Collect the path of every Excel workbook stored one folder below the data root.\n",
"data_path = Path('../data/201501-201603')\n",
"file_list = []\n",
"for folder in data_path.iterdir():\n",
"    if folder.is_dir():\n",
"        # extend() adds the paths one by one; append() would add the whole list as a single element.\n",
"        file_list.extend([str(f) for f in folder.iterdir() if f.suffix in ['.xls', '.xlsx']])\n",
"\n",
"# iterdir() gives no ordering guarantee, so sort to keep the processing order\n",
"# (and therefore the row order of the merged CSV) reproducible.\n",
"file_list.sort()\n",
"\n",
"print(f\"找到 {len(file_list)} 个文件\")\n",
"\n",
"# Create the output directory.\n",
"Path('../tmp').mkdir(parents=True, exist_ok=True)\n",
"save_file = '../tmp/Station.csv'\n",
"\n",
"# Remove the result of a previous run, because the loop below appends to the file.\n",
"if Path(save_file).exists():\n",
"    Path(save_file).unlink()\n",
"\n",
"# Process the workbooks one at a time.\n",
"for file_idx, file_path in enumerate(file_list):\n",
"    print(f\"处理文件 {file_idx + 1}/{len(file_list)}: {Path(file_path).name}\")\n",
"\n",
"    # Read the sheet; column 0 carries every marker label searched for below.\n",
"    data = pd.read_excel(file_path)\n",
"    first_col = data.iloc[:, 0].tolist() if data.shape[1] else []\n",
"\n",
"    # Departure date: the digit/dash run found in each cell that contains '始发日期'.\n",
"    start_dates = []\n",
"    for cell in first_col:\n",
"        if '始发日期' in str(cell):\n",
"            date_match = re.findall(r'[0-9\\—]+', str(cell))\n",
"            if date_match:\n",
"                start_dates.append(date_match[0])\n",
"\n",
"    # Rows that open a section ('上车站') and rows that hold the boarding totals ('上车人数合计').\n",
"    on_station_rows = [i for i, x in enumerate(first_col) if x == '上车站']\n",
"    on_count_rows = [i for i, x in enumerate(first_col) if x == '上车人数合计']\n",
"\n",
"    # Train code of each section, read from the cell directly above its '上车站' row.\n",
"    # Exactly one entry is stored per section ('' when no code is found) so that\n",
"    # train_lines[idx] always belongs to on_station_rows[idx].\n",
"    train_lines = []\n",
"    for row in on_station_rows:\n",
"        # Row 0 has nothing above it (row - 1 would wrap around to the last row),\n",
"        # and a non-text cell cannot be searched with a regex.\n",
"        title = first_col[row - 1] if row > 0 else None\n",
"        line_match = re.findall(r'[A-Z]{2}[0-9]{2}', title) if isinstance(title, str) else []\n",
"        train_lines.append(line_match[0] if line_match else '')\n",
"\n",
"    # Skip the workbook when it has no date or no train code at all.\n",
"    if not start_dates or not any(train_lines):\n",
"        print(f\" 警告: 未找到有效数据,跳过\")\n",
"        continue\n",
"\n",
"    start_date = start_dates[0]\n",
"\n",
"    # Column of '下车人数合计' in each section's header row (None when it is absent).\n",
"    off_count_cols = []\n",
"    for row in on_station_rows:\n",
"        cols = [i for i, x in enumerate(data.iloc[row, :]) if x == '下车人数合计']\n",
"        off_count_cols.append(cols[0] if cols else None)\n",
"\n",
"    # Build the records of every section.\n",
"    all_records = []\n",
"\n",
"    for idx, on_row in enumerate(on_station_rows):\n",
"        off_count_col = off_count_cols[idx]\n",
"        # A section ends at the first totals row below its header row. Pairing the two\n",
"        # lists by position would raise IndexError or mix sections if a totals row is missing.\n",
"        on_count_row = next((r for r in on_count_rows if r > on_row), None)\n",
"\n",
"        if off_count_col is None or on_count_row is None:\n",
"            continue\n",
"\n",
"        # Rows between the time row (on_row + 1) and the totals row:\n",
"        # column 0 -> station, '下车人数合计' column -> off_man, column 1 -> off_time.\n",
"        off_stations = data.iloc[on_row+2:on_count_row, 0].tolist()\n",
"        off_counts = data.iloc[on_row+2:on_count_row, off_count_col].tolist()\n",
"        off_times = data.iloc[on_row+2:on_count_row, 1].tolist()\n",
"\n",
"        # Columns 2 up to the '下車' total column are read from three rows:\n",
"        # header row -> station, totals row -> on_man, row below the header -> on_time.\n",
"        on_stations = data.iloc[on_row, 2:off_count_col].tolist()\n",
"        on_counts = data.iloc[on_count_row, 2:off_count_col].tolist()\n",
"        on_times = data.iloc[on_row+1, 2:off_count_col].tolist()\n",
"\n",
"        n_off = len(off_stations)\n",
"        n_on = len(on_stations)\n",
"\n",
"        # One record per station row; the boarding figures fall back to 0 when there are\n",
"        # fewer station columns than station rows.\n",
"        # NOTE(review): the i-th row is paired with the i-th column, which assumes both\n",
"        # list the stations in the same order - confirm against the sheet layout.\n",
"        for i in range(n_off):\n",
"            record = {\n",
"                'on_station': off_stations[i],\n",
"                'on_man': on_counts[i] if i < n_on else 0,\n",
"                'on_time': on_times[i] if i < n_on else 0,\n",
"                'off_man': off_counts[i],\n",
"                'off_time': off_times[i],\n",
"                'date': start_date,\n",
"                'train': train_lines[idx]\n",
"            }\n",
"            all_records.append(record)\n",
"\n",
"    # Append this workbook's records to the CSV; the header is written only while\n",
"    # the file does not exist yet.\n",
"    if all_records:\n",
"        df_records = pd.DataFrame(all_records)\n",
"        df_records.to_csv(save_file, mode='a', header=not Path(save_file).exists(),\n",
"                          index=False, encoding='utf-8')\n",
"\n",
"print(f\"\\n数据提取完成保存至: {save_file}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e5bba694-33a9-489e-8dfd-a9e6dc941f13",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# ==================== Listing 9-2: data cleaning ====================\n",
"\n",
"# Load the merged table and normalise its placeholder values in one chain:\n",
"#   1. missing values become 0;\n",
"#   2. cells equal to the string '0.1' become 0;\n",
"#   3. cells equal to a single space become 0.\n",
"# Each step works on the whole frame at once, so no explicit row loop is needed.\n",
"Train_Station = (\n",
"    pd.read_csv('../tmp/Station.csv', encoding='utf-8')\n",
"    .fillna(0)\n",
"    .replace('0.1', 0)\n",
"    .replace(' ', 0)\n",
")\n",
"\n",
"# Normalise the date format.\n",
"def format_date(date_str):\n",
"    \"\"\"Format a date value as YYYY-MM-DD.\n",
"\n",
"    Two layouts are recognised after converting the value to text:\n",
"\n",
"    * a compact run of at least eight digits (e.g. 20150101), whose first\n",
"      eight digits are read as YYYYMMDD;\n",
"    * a four-digit year followed by a separate month and day of one or two\n",
"      digits each (e.g. '2015—01—01' or '2015-1-5'), which is the other shape\n",
"      the extraction regex can produce.\n",
"\n",
"    Args:\n",
"        date_str: The raw date value (string or number).\n",
"\n",
"    Returns:\n",
"        The 'YYYY-MM-DD' string, or ``date_str`` unchanged when neither\n",
"        layout matches.\n",
"    \"\"\"\n",
"    numbers = re.findall(r'\\d+', str(date_str))\n",
"    if numbers and len(numbers[0]) >= 8:\n",
"        date_num = numbers[0]\n",
"        return f\"{date_num[0:4]}-{date_num[4:6]}-{date_num[6:8]}\"\n",
"    if len(numbers) >= 3 and len(numbers[0]) == 4:\n",
"        year, month, day = numbers[:3]\n",
"        if len(month) <= 2 and len(day) <= 2:\n",
"            # Pad single-digit months/days so every result has the same width.\n",
"            return f\"{year}-{month.zfill(2)}-{day.zfill(2)}\"\n",
"    return date_str\n",
"\n",
"# Rewrite every date with format_date.\n",
"formatted_dates = Train_Station['date'].apply(format_date)\n",
"Train_Station['date'] = formatted_dates\n",
"\n",
"# Force the passenger counts to numeric: errors='coerce' turns any value that\n",
"# cannot be parsed into NaN, which is then replaced by 0.\n",
"for count_col in ('on_man', 'off_man'):\n",
"    Train_Station[count_col] = pd.to_numeric(Train_Station[count_col], errors='coerce').fillna(0)\n",
"\n",
"# Save the cleaned table.\n",
"Train_Station.to_csv('../tmp/Train_Station.csv', index=False, encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "789b477a-0cb5-4250-ad12-bf810780df8c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"数据清洗完成\n",
"总记录数: 396060\n",
"日期范围: 2015-01-01 至 2016-03-20\n",
"\n",
"数据预览:\n",
" on_station on_man on_time off_man off_time date train\n",
"0 ST074 891 13:00 0 13:00 2015-01-01 PK11\n",
"1 ST219 69 13:41 161 13:39 2015-01-01 PK11\n",
"2 ST054 150 14:37 40 14:34 2015-01-01 PK11\n",
"3 ST036 72 15:22 25 15:19 2015-01-01 PK11\n",
"4 ST313 432 16:30 356 16:19 2015-01-01 PK11\n"
]
}
],
"source": [
"\n",
"# Report the size, the date span and the first rows of the cleaned table.\n",
"print(\"\\n数据清洗完成\")\n",
"record_total = len(Train_Station)\n",
"print(f\"总记录数: {record_total}\")\n",
"date_column = Train_Station['date']\n",
"first_day, last_day = date_column.min(), date_column.max()\n",
"print(f\"日期范围: {first_day} 至 {last_day}\")\n",
"print(\"\\n数据预览:\")\n",
"print(Train_Station.head())\n",
"\n",
"# Reload the saved file so that later cells work from the data as stored on disk.\n",
"Train_Station = pd.read_csv('../tmp/Train_Station.csv', encoding='utf-8')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}