You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
398 lines
15 KiB
398 lines
15 KiB
import io
|
|
import os
|
|
import gzip
|
|
import time
|
|
import json
|
|
import threading
|
|
import pandas as pd
|
|
from seleniumwire import webdriver
|
|
from datetime import datetime as dt,timedelta
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException,ElementNotInteractableException,ElementClickInterceptedException # 加载异常
|
|
|
|
|
|
|
|
class FLIGHT(object):
    """Scrape direct-flight schedules and fares from the Ctrip flight search page.

    The scraper drives Chrome through selenium-wire, submits a one-way
    search for ``self.city`` (a ``[departure, arrival]`` pair) on
    ``self.date``, captures the ``batchSearch`` API call made by the page,
    and writes one CSV per city pair.

    The steps call each other in a chain:
    ``getpage -> changecity -> getdata -> decode_data -> check_data ->
    muti_process -> mergedata``.  A failing step restarts the chain from an
    earlier step, so restarts nest on the call stack.
    """

    # Landing page every search starts from.
    HOME_URL = 'https://flights.ctrip.com/online/channel/domestic'
    # Pattern handed to selenium-wire to find the search API call.
    SEARCH_API_PATTERN = '/international/search/api/search/batchSearch?.*'
    # How many times getpage() tries to submit the search form before giving up.
    MAX_PAGE_RETRIES = 5
    # How many times changecity() retries on element errors before restarting.
    MAX_CITY_RETRIES = 5
    # How many times the date picker is opened before the attempt is abandoned.
    MAX_DATE_ATTEMPTS = 5
    # Back-off used when the verification-code (captcha) element is present.
    CAPTCHA_WAIT_SECONDS = 7200

    # Flight fields dropped before the record is stored.  The two date-time
    # fields are dropped because they are re-added split into day and time.
    _DROPPED_FLIGHT_FIELDS = (
        'sequenceNo', 'marketAirlineCode',
        'departureProvinceId', 'departureCityId', 'departureCityCode',
        'departureAirportShortName', 'departureTerminal',
        'arrivalProvinceId', 'arrivalCityId', 'arrivalCityCode',
        'arrivalAirportShortName', 'arrivalTerminal',
        'transferDuration', 'stopList', 'leakedVisaTagSwitch', 'trafficType',
        'highLightPlaneNo', 'mealType',
        'operateAirlineCode', 'arrivalDateTime', 'departureDateTime',
        'operateFlightNo', 'operateAirlineName',
    )

    # Source column -> CSV header, in output order.
    _COLUMN_LABELS = {
        '数据获取日期': '数据获取日期',
        'flightNo': '航班号',
        'marketAirlineName': '航空公司',
        'departureday': '出发日期',
        'departuretime': '出发时间',
        'arrivalday': '到达日期',
        'arrivaltime': '到达时间',
        'duration': '飞行时长',
        'departureCountryName': '出发国家',
        'departureCityName': '出发城市',
        'departureAirportName': '出发机场',
        'departureAirportCode': '出发机场三字码',
        'arrivalCountryName': '到达国家',
        'arrivalCityName': '到达城市',
        'arrivalAirportName': '到达机场',
        'arrivalAirportCode': '到达机场三字码',
        'aircraftName': '飞机型号',
        'aircraftSize': '飞机尺寸',
        'aircraftCode': '飞机型号三字码',
        'economy_origin': '经济舱原价',
        'economy_low': '经济舱最低价',
        'economy_cut': '经济舱折扣',
        'bussiness_origin': '商务舱原价',
        'bussiness_low': '商务舱最低价',
        'bussiness_cut': '商务舱折扣',
        'arrivalPunctuality': '到达准点率',
        'stopCount': '停留次数',
    }

    def __init__(self):
        """Start Chrome and open the search home page."""
        # Location of the chromedriver executable.
        self.chromeDriverPath = 'C:/Program Files/Google/Chrome/Application/chromedriver'
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--incognito')
        self.options.add_argument("--disable-blink-features")
        self.options.add_argument("--disable-blink-features=AutomationControlled")
        # Hide the "controlled by automated test software" banner.
        self.options.add_experimental_option("excludeSwitches", ['enable-automation'])
        # `options=` replaces the deprecated `chrome_options=` alias.
        # NOTE(review): `executable_path` is itself deprecated in Selenium 4;
        # confirm the installed version before migrating to a Service object.
        self.driver = webdriver.Chrome(executable_path=self.chromeDriverPath,
                                       options=self.options)
        self.driver.set_page_load_timeout(300)
        self.driver.maximize_window()
        # Consecutive element-error count used by changecity().
        self.err = 0
        self.driver.get(self.HOME_URL)

    def getpage(self):
        """Submit a one-way search for ``self.date`` and continue with changecity().

        If the verification-code element is present the method sleeps for
        ``CAPTCHA_WAIT_SECONDS`` and starts over.

        Raises:
            RuntimeError: the search form could not be submitted after
                ``MAX_PAGE_RETRIES`` attempts.
        """
        for _ in range(self.MAX_PAGE_RETRIES):
            try:
                self._submit_search()
            except Exception as e:
                print('页面连接失败', e)
                # Reload the home page instead of closing the window: a closed
                # window makes every later driver call fail, so the retry
                # could never succeed.
                self._reload_home()
            else:
                break
        else:
            raise RuntimeError(
                'search page could not be prepared after %d attempts'
                % self.MAX_PAGE_RETRIES)

        # find_elements returns an empty list when the element is absent, so
        # no exception handling is needed to detect the captcha.
        if self.driver.find_elements(By.ID, "verification-code"):
            print('等待2小时后重试')
            time.sleep(self.CAPTCHA_WAIT_SECONDS)
            self.getpage()
        else:
            self.changecity()

    def _reload_home(self):
        """Navigate back to the home page, reporting (not raising) failures."""
        try:
            self.driver.get(self.HOME_URL)
        except Exception as e:
            print('页面连接失败', e)

    def _submit_search(self):
        """Select one-way, set the departure date and press the search button."""
        # The flight icon returns to the main search form.
        self.driver.find_element(By.CLASS_NAME, 'pc_home-jipiao').click()
        self.driver.implicitly_wait(5)  # seconds
        # First radio label is the one-way option.
        self.driver.find_elements(By.CLASS_NAME, 'radio-label')[0].click()

        attempts = 0
        while (self.driver.find_elements(By.CSS_SELECTOR, "[aria-label=请选择日期]")[0]
               .get_attribute("value") != self.date):
            if attempts >= self.MAX_DATE_ATTEMPTS:
                raise RuntimeError('departure date could not be set to ' + self.date)
            attempts += 1
            self._pick_departure_date()

        self.driver.find_element(By.CLASS_NAME, 'search-btn').click()

    def _pick_departure_date(self):
        """Open the date picker and click the day of ``self.date``.

        Only month and day are compared; the year is not checked.

        Returns:
            True if a day cell was clicked, otherwise False.
        """
        target_month = int(self.date[5:7])
        target_day = int(self.date[-2:])
        self.driver.find_element(By.CLASS_NAME, 'modifyDate.depart-date').click()

        for panel in self.driver.find_elements(By.CLASS_NAME, 'date-picker.date-picker-block'):
            # The month label's last character is dropped before parsing.
            if int(panel.find_element(By.CLASS_NAME, 'month').text[:-1]) != target_month:
                continue
            for cell in panel.find_elements(By.CLASS_NAME, 'date-d'):
                label = cell.text.strip()
                if label.isdigit() and int(label) == target_day:
                    cell.click()
                    # Stop immediately: the remaining panels must not be
                    # touched once the click has been made.
                    return True
        return False

    def remove_btn(self):
        """Remove the ``.notice-box`` overlay from the page, if possible."""
        try:
            js_remove = "$('.notice-box').remove();"
            self.driver.execute_script(js_remove)
        except Exception as e:
            print('防疫移除失败', e)

    def _fill_city(self, box, name):
        """Type *name* into the input *box* until its value contains it."""
        # get_attribute may return None; treat that as an empty value.
        while name not in (box.get_attribute('value') or ''):
            box.click()
            time.sleep(0.5)
            box.send_keys(Keys.CONTROL + 'a')
            time.sleep(0.5)
            box.send_keys(name)
        time.sleep(0.5)

    def changecity(self):
        """Set departure/arrival to ``self.city`` and continue with getdata().

        Element errors are retried up to ``MAX_CITY_RETRIES`` times; after
        that, or on any other error, captured requests are cleared and the
        flow restarts from getpage().
        """
        self.remove_btn()

        try:
            # Departure and arrival inputs, in that order.
            its = self.driver.find_elements(By.CLASS_NAME, 'form-input-v3')
            self._fill_city(its[0], self.city[0])
            self._fill_city(its[1], self.city[1])

            try:
                # Clicking the low-price reminder is used to trigger the search.
                self.driver.implicitly_wait(5)  # seconds
                self.driver.find_elements(By.CLASS_NAME, 'low-price-remind')[0].click()
            except IndexError as e:
                print('\n更换城市错误 找不到元素', e)
                # Fallback: submit with the Enter key.
                its[1].send_keys(Keys.ENTER)

            print('\n更换城市成功', self.city[0] + '-' + self.city[1])
        except (IndexError, ElementNotInteractableException,
                StaleElementReferenceException,
                ElementClickInterceptedException) as e:
            print('\n更换城市错误 元素错误', e)
            self.err += 1
            if self.err <= self.MAX_CITY_RETRIES:
                self.changecity()
            else:
                self.err = 0
                del self.driver.requests
                self.getpage()
        except Exception as e:
            print('\n更换城市错误', e)
            # Drop the captured requests and start over from the home page.
            del self.driver.requests
            self.getpage()
        else:
            self.err = 0
            self.getdata()

    def getdata(self):
        """Wait for the search API call and check it matches ``self.city``.

        On a match, continues with decode_data(); on a mismatch, re-enters
        changecity(); on a timeout or an unreadable request body, restarts
        from getpage().  Captured requests are cleared in every case.
        """
        try:
            self.predata = self.driver.wait_for_request(self.SEARCH_API_PATTERN, timeout=30)
            rb = dict(json.loads(self.predata.body).get('flightSegments')[0])
            departure = rb['departureCityName']
            arrival = rb['arrivalCityName']
        except (TimeoutException, ValueError, TypeError, KeyError,
                IndexError, AttributeError) as e:
            print('\n获取数据错误', e)
            del self.driver.requests
            self.getpage()
        else:
            del self.driver.requests
            if departure == self.city[0] and arrival == self.city[1]:
                print('城市获取正确')
                self.decode_data()
            else:
                self.changecity()

    def decode_data(self):
        """Decode the captured response body into ``self.dedata``.

        The body is gunzipped only when it starts with the gzip magic
        number, so an uncompressed JSON body is accepted too.  On failure
        the flow restarts from getpage(); on success it continues with
        check_data().
        """
        try:
            raw = self.predata.response.body
            if raw[:2] == b'\x1f\x8b':
                raw = gzip.decompress(raw)
            self.dedata = json.loads(raw.decode('UTF-8'))
        except Exception as e:
            # Decompression and decoding raise several unrelated exception
            # types; all of them mean "fetch again".
            print('重新获取数据', e)
            self.getpage()
        else:
            self.check_data()

    def check_data(self):
        """Keep only itineraries with no transfer and process them.

        Returns:
            0 when the payload is malformed or has no direct flight;
            None after muti_process() has run.
        """
        try:
            itineraries = self.dedata['data']['flightItineraryList']
            self.flightItineraryList = [
                item for item in itineraries
                if item['flightSegments'][0]['transferCount'] == 0
            ]
        except (KeyError, IndexError, TypeError) as e:
            print('不存在直航航班', e)
            return 0

        if not self.flightItineraryList:
            print('不存在直航航班')
            return 0

        self.muti_process()

    def muti_process(self):
        """Build the flight and price tables on two threads, then merge them."""
        self.flights = pd.DataFrame()
        self.prices = pd.DataFrame()

        workers = [
            threading.Thread(target=self.proc_flightSegments),
            threading.Thread(target=self.proc_priceList),
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        self.mergedata()

    def proc_flightSegments(self):
        """Build ``self.flights`` with one row per itinerary.

        Each row is the first flight of the itinerary's first segment, with
        the fields in ``_DROPPED_FLIGHT_FIELDS`` removed and the date-times
        split into day and time columns.  Itineraries that cannot be parsed
        are reported and skipped.
        """
        rows = []
        for itinerary in self.flightItineraryList:
            try:
                unit = dict(itinerary['flightSegments'][0]['flightList'][0])
                departureday, departuretime = unit['departureDateTime'].split(' ')[:2]
                arrivalday, arrivaltime = unit['arrivalDateTime'].split(' ')[:2]
            except (KeyError, IndexError, TypeError, ValueError, AttributeError) as e:
                print('航班信息解析失败', e)
                continue

            for field in self._DROPPED_FLIGHT_FIELDS:
                unit.pop(field, None)

            unit.update({'departureday': departureday, 'departuretime': departuretime,
                         'arrivalday': arrivalday, 'arrivaltime': arrivaltime})
            rows.append(unit)

        # One DataFrame build instead of a concat per row.
        self.flights = pd.DataFrame(rows)

    @staticmethod
    def _summarize_cabin(prices, discounts):
        """Return ``(origin, low, cut)`` for one cabin class.

        ``origin`` is the price whose discount rate is 1; if there is none
        it is estimated as ``max(prices) / max(discounts)``.  ``low`` and
        ``cut`` are the minimum price and minimum discount, reported only
        when some fare is discounted.  Unknown values are ``''``.
        """
        if not prices:
            return '', '', ''

        if 1 in discounts:
            origin = prices[discounts.index(1)]
        else:
            best_rate = max(discounts)
            # A zero rate cannot be used to estimate the original price.
            origin = int(max(prices) / best_rate) if best_rate else ''

        if min(discounts) != 1:
            return origin, min(prices), min(discounts)
        return origin, '', ''

    def proc_priceList(self):
        """Build ``self.prices`` with one fare-summary row per itinerary.

        Cabin ``'Y'`` is summarised as economy and ``'C'`` as business; other
        cabins are ignored.  Itineraries that cannot be parsed are reported
        and skipped.
        """
        rows = []
        for itinerary in self.flightItineraryList:
            try:
                flightNo = itinerary['itineraryId'].split('_')[0]
                # cabin code -> (prices, discount rates)
                fares = {'Y': ([], []), 'C': ([], [])}
                for price in itinerary['priceList']:
                    adult_price = price['adultPrice']
                    rate = price['priceUnitList'][0]['flightSeatList'][0]['discountRate']
                    bucket = fares.get(price['cabin'])
                    if bucket is not None:
                        bucket[0].append(adult_price)
                        bucket[1].append(rate)

                economy_origin, economy_low, economy_cut = self._summarize_cabin(*fares['Y'])
                bussiness_origin, bussiness_low, bussiness_cut = self._summarize_cabin(*fares['C'])
            except (KeyError, IndexError, TypeError, AttributeError) as e:
                print('票价信息解析失败', e)
                continue

            rows.append({
                'flightNo': flightNo,
                'economy_origin': economy_origin,
                'economy_low': economy_low,
                'economy_cut': economy_cut,
                'bussiness_origin': bussiness_origin,
                'bussiness_low': bussiness_low,
                'bussiness_cut': bussiness_cut,
            })

        self.prices = pd.DataFrame(rows)

    def mergedata(self):
        """Join flights and prices on ``flightNo`` and write them to a CSV.

        The file is written to ``<cwd>/<date>/<date>-<from>-<to>.csv`` with
        GB18030 encoding.  Any failure is reported and swallowed so the run
        can continue with the next city pair.
        """
        try:
            self.df = self.flights.merge(self.prices, on=['flightNo'])
            self.df['数据获取日期'] = dt.now().strftime('%Y-%m-%d')

            # Rename to the CSV headers and keep only those columns, in order.
            self.df = self.df.rename(columns=self._COLUMN_LABELS)
            self.df = self.df[list(self._COLUMN_LABELS.values())]

            os.makedirs(self.date, exist_ok=True)
            basename = '-'.join([self.date, self.city[0], self.city[1]]) + '.csv'
            # os.path.join keeps the path valid on every platform.
            filename = os.path.join(os.getcwd(), self.date, basename)

            self.df.to_csv(filename, encoding='GB18030', index=False)
            print('\n数据爬取完成', filename)
        except Exception as e:
            print('合并数据失败', e)

    def demain(self, citys):
        """Scrape every ``[departure, arrival]`` pair in *citys* for tomorrow.

        The first pair goes through getpage(); later pairs only change the
        cities on the already-open result page.  The browser is closed when
        the loop ends, including when an exception propagates.
        """
        self.date = (dt.now() + timedelta(days=1)).strftime('%Y-%m-%d')

        try:
            for position, city in enumerate(citys):
                self.city = city
                if position == 0:
                    self.getpage()
                else:
                    self.changecity()
        finally:
            self.driver.quit()
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Cities to cover; every ordered pair of two different cities is one route.
    city = ['上海', '广州', '深圳', '北京']
    # Arrival candidates are walked in reverse order of the list above.
    ytic = city[::-1]
    citys = [[m, n] for m in city for n in ytic if m != n]

    fly = FLIGHT()
    fly.demain(citys)
    print('\n程序运行完成!!!!')
|
|
|