Merge branch 'dev-clawer' into dev

Merge the main System and FlightInformation-clawer into the same branch to
form a complete system
dev
Lin 6 days ago
commit a29f33b493

@ -0,0 +1,6 @@
projectKey=clawer
serverUrl=http://localhost:9000
serverVersion=7.8.0.26217
dashboardUrl=http://localhost:9000/dashboard?id=clawer
ceTaskId=AZMv5JVBnAUFl5pPDUTm
ceTaskUrl=http://localhost:9000/api/ce/task?id=AZMv5JVBnAUFl5pPDUTm

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Suysker
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

File diff suppressed because it is too large Load Diff

@ -0,0 +1,157 @@
import os
import re
import subprocess
# Global variables for proxy switch count
# Incremented by switch_proxy_server() each time it rotates to another address.
proxy_switch_count = 0
# Interface name -> IPv6 address; replaced by start_proxy_servers() with the
# mapping returned from get_existing_interfaces().
iface_ipv6_dict = {}
def is_root():
    """Return True when the effective user id of this process is 0."""
    effective_uid = os.geteuid()
    return effective_uid == 0
def interface_usable(interface_name, skip_check=False, ipv6_address='2400:3200::1', max_retries=3):
    """Report whether `ipv6_address` answers a single ping sent via `interface_name`.

    Args:
        interface_name: Interface passed to ``ping -I``.
        skip_check: When true, return True without pinging.
        ipv6_address: Target address to ping.
        max_retries: Maximum number of ping attempts.

    Returns:
        True as soon as one ping exits with status 0, otherwise False once
        every attempt has failed, timed out, or raised a subprocess error.
    """
    if skip_check:
        return True

    ping_cmd = ["ping", "-c", "1", "-I", interface_name, ipv6_address]
    for attempt in range(max_retries):
        try:
            outcome = subprocess.run(ping_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=5)
        except subprocess.TimeoutExpired:
            print(f"Ping attempt {attempt + 1} of {max_retries} timed out. Retrying...")
            continue
        except subprocess.SubprocessError as e:
            # Any other subprocess-level failure also counts as a failed attempt.
            print(f"An error occurred while trying to ping: {e}. Retrying...")
            continue
        if outcome.returncode == 0:
            # A successful ping ends the check immediately.
            return True
    # Every attempt was used up without a successful ping.
    return False
def get_existing_interfaces(base_interface='eth0'):
    """Map every ``<base_interface>_<number>`` interface to its first non-fe80 IPv6 address.

    Runs ``ip addr show`` to discover interface names, then ``ip addr show
    <iface>`` per interface to read its addresses. Interfaces that only have
    addresses starting with ``fe80`` (or none) are left out of the result.

    Returns:
        dict of interface name -> IPv6 address string.
    """
    listing = subprocess.run(["ip", "addr", "show"], stdout=subprocess.PIPE).stdout.decode()
    # Interface names appear as "<base>_<n>@<parent>" in the listing.
    suffixes = re.findall(re.escape(base_interface) + r'_([0-9]+)@', listing)

    address_re = re.compile(r"inet6\s+([0-9a-f:]+)\/\d+")
    mapping = {}
    for suffix in suffixes:
        iface = f"{base_interface}_{suffix}"
        # Query the interface on its own so only its addresses are matched.
        detail = subprocess.run(["ip", "addr", "show", iface], stdout=subprocess.PIPE).stdout.decode()
        # Skip addresses that start with "fe80" and keep only the first remaining one.
        first_global = next(
            (addr for addr in address_re.findall(detail) if not addr.startswith("fe80")),
            None,
        )
        if first_global is not None:
            mapping[iface] = first_global
    return mapping
def execute_ip6tables_command(command):
    """Run a whitespace-separated command line, prefixed with ``sudo`` when not root.

    The command's stdout and stderr are captured and discarded; its exit
    status is not checked.
    """
    argv = command.split()
    if not is_root():
        argv = ["sudo"] + argv
    subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
def switch_proxy_server(mode='normal'):
    """Rotate the SNAT source address to the next entry of ``iface_ipv6_dict``.

    Only acts when `mode` is ``'normal'`` and at least one interface is known.
    The module-level switch counter is incremented and used, modulo the number
    of interfaces, to pick the next address.
    """
    global proxy_switch_count
    global iface_ipv6_dict

    if mode != 'normal' or not iface_ipv6_dict:
        return

    proxy_switch_count += 1
    names = list(iface_ipv6_dict.keys())
    selected_interface = names[proxy_switch_count % len(iface_ipv6_dict)]
    ipv6_address = iface_ipv6_dict[selected_interface]

    # Flush the custom chain, then install the single SNAT rule.
    execute_ip6tables_command('ip6tables -t nat -F FAKE_IPV6_CHAIN')
    execute_ip6tables_command(f'ip6tables -t nat -A FAKE_IPV6_CHAIN -j SNAT --to-source {ipv6_address}')
    print(f"Using interface: {selected_interface}, Connecting to: {ipv6_address}")
def create_ipv6_addresses(n, base_interface='eth0', delete_interface=True):
    """Ensure macvlan interfaces ``<base_interface>_1`` .. ``<base_interface>_n`` exist.

    Args:
        n: Number of interfaces to provide.
        base_interface: Parent link the macvlan interfaces are attached to.
        delete_interface: When true, remove all existing numbered interfaces first.

    Returns:
        List of interface names, in numeric order.
    """
    prefix = [] if is_root() else ["sudo"]
    if delete_interface:
        delete_ipv6_addresses(base_interface)
    already_present = list(get_existing_interfaces(base_interface).keys())

    result = []
    for number in range(1, n + 1):
        interface_name = f"{base_interface}_{number}"
        if interface_name in already_present:
            if interface_usable(interface_name):
                # A working interface is reused as-is.
                print(f"Interface {interface_name} already exists. Skipping creation.")
                result.append(interface_name)
                continue
            # An existing but unusable interface is removed and recreated below.
            subprocess.run(prefix + ["ip", "link", "delete", interface_name])
        subprocess.run(prefix + ["ip", "link", "add", "link", base_interface, interface_name, "type", "macvlan", "mode", "bridge"])
        subprocess.run(prefix + ["ip", "link", "set", interface_name, "up"])
        #subprocess.run(sudo_cmd + ["dhclient", "-6", "-nw", interface_name])
        result.append(interface_name)
    return result
def delete_ipv6_addresses(base_interface='eth0'):
    """Delete every interface reported by ``get_existing_interfaces(base_interface)``."""
    prefix = [] if is_root() else ["sudo"]
    for name in get_existing_interfaces(base_interface):
        subprocess.run(prefix + ["ip", "link", "delete", name])
def stop_proxy_servers(base_interface='eth0', delete_interface=True):
    """Remove the FAKE_IPV6_CHAIN ip6tables rules and optionally the interfaces.

    Args:
        base_interface: Parent link whose numbered interfaces may be deleted.
        delete_interface: When true, also delete the numbered interfaces.
    """
    teardown_rules = (
        # Detach the custom chain from POSTROUTING ...
        'ip6tables -t nat -D POSTROUTING -j FAKE_IPV6_CHAIN',
        # ... then delete the chain itself.
        'ip6tables -t nat -X FAKE_IPV6_CHAIN',
    )
    for rule in teardown_rules:
        execute_ip6tables_command(rule)

    print("正在关闭代理服务器...")
    if delete_interface:
        print("删除IPv6地址...")
        delete_ipv6_addresses(base_interface)
    print("代理服务器已关闭.")
def start_proxy_servers(n, mode='normal', base_interface='eth0', delete_interface=True):
    """Create the interfaces and install the ip6tables SNAT rules.

    Args:
        n: Number of interfaces to create.
        mode: ``'normal'`` installs one SNAT rule for the first interface;
            ``'random'`` installs one probabilistic SNAT rule per interface.
        base_interface: Parent link and outgoing interface for the NAT rule.
        delete_interface: Forwarded to ``create_ipv6_addresses``.
    """
    global iface_ipv6_dict

    create_ipv6_addresses(n, base_interface, delete_interface)
    # Re-read the interfaces together with the addresses they obtained.
    iface_ipv6_dict = get_existing_interfaces(base_interface)
    if not iface_ipv6_dict:
        return

    chain_setup = (
        # Drop any previous redirect and chain, then recreate both.
        'ip6tables -t nat -D POSTROUTING -j FAKE_IPV6_CHAIN',
        'ip6tables -t nat -X FAKE_IPV6_CHAIN',
        'ip6tables -t nat -N FAKE_IPV6_CHAIN',
        f'ip6tables -t nat -A POSTROUTING -o {base_interface} -j FAKE_IPV6_CHAIN',
    )
    for rule in chain_setup:
        execute_ip6tables_command(rule)

    if mode == 'normal':
        selected_interface, ipv6_address = next(iter(iface_ipv6_dict.items()))
        execute_ip6tables_command(f'ip6tables -t nat -A FAKE_IPV6_CHAIN -j SNAT --to-source {ipv6_address}')
        print(f"Using interface: {selected_interface}, Connecting to: {ipv6_address}")
    elif mode == 'random':
        total = len(iface_ipv6_dict)
        for index, ipv6_address in enumerate(iface_ipv6_dict.values()):
            # Rule k matches with probability 1/(remaining rules), so the last rule always matches.
            adjusted_probability = 1/(total-index)
            execute_ip6tables_command(f'ip6tables -t nat -A FAKE_IPV6_CHAIN -m statistic --mode random --probability {adjusted_probability} -j SNAT --to-source {ipv6_address}')

@ -0,0 +1,50 @@
# Ctrip-Crawler
## 概述
Ctrip-Crawler 是一个携程航班信息的专业爬虫工具,主要基于 Selenium 框架进行实现。
通过 request 方法直接访问携程 API 的途径,由于 IP 限制和 JS 逆向工程的挑战,已不再适用(会报错)。
携程支持 IPV6 访问,因此可以通过生成大量 IPV6 地址来规避 IP 限制。
## 主要特性
Selenium 自动化框架:与直接请求 API 的方法不同,该项目基于 Selenium,提供高度可定制和交互式的浏览器模拟。
灵活的错误处理机制:针对不同类型的异常(如超时、验证码出现、未知错误等),实施相应的处理策略,包括重试和人工干预。
IP 限制解决方案:利用页面特性和用户模拟规避了 IP 限制,提高了爬取稳定性。
数据校验与解析:对获取的数据进行严格的数据质量和完整性校验,包括 gzip 解压缩和 JSON 格式解析。
版本迭代与优化:V2 版本解决了验证码问题;V3 版本提高了系统的稳定性和可用性;V3.5 版本增加了 Linux 系统下多 IPV6 网口的生成与代理。
## 文档和教程
详细的使用指南和开发文档可在以下博客中查看:
[基于selenium的携程机票爬取程序](https://blog.suysker.xyz/archives/35)
[基于selenium的携程机票爬取程序V2](https://blog.suysker.xyz/archives/139)
[基于request的携程机票爬取程序](https://blog.suysker.xyz/archives/37)
[基于request的航班历史票价爬取](https://blog.suysker.xyz/archives/36)
## TO DO
V4.0增加多线程分片运行……
## 贡献与反馈
如果你有更好的优化建议或发现任何 bug,请通过 Issues 或 Pull Requests 与我们交流。我们非常欢迎各种形式的贡献!

@ -0,0 +1,73 @@
import pandas as pd
import os
from datetime import datetime, timedelta
def get_departure_destination(file_name):
    """Return `file_name` with its final extension removed."""
    stem, _extension = os.path.splitext(file_name)
    return stem
def merge_csv_files(csv_files, output_xlsx):
    """Concatenate selected columns of several CSV files into one Excel workbook.

    Each file gets a '出发日期' column taken from the name of its grandparent
    directory, is cut down to a fixed column list, and has 'economy_origin'
    renamed to '票价'.

    Args:
        csv_files: Paths of the CSV files to merge.
        output_xlsx: Path of the workbook written with the openpyxl engine.
    """
    # Columns kept in the output, in output order.
    selected_columns = [
        '航班号','出发城市','到达城市', '航空公司', '出发日期', '出发时间', '到达时间',
        '中转信息', 'economy_origin', '经济舱餐食信息', '经济舱座椅间距', '出发延误时间'
    ]

    frames = []
    for path in csv_files:
        frame = pd.read_csv(path)
        # <date>/<crawl date>/<file>.csv -> the departure date is two levels up.
        parent_dir = os.path.dirname(path)
        frame['出发日期'] = os.path.basename(os.path.dirname(parent_dir))
        frame = frame[selected_columns].rename(columns={'economy_origin': '票价'})
        frames.append(frame)

    merged = pd.concat(frames, ignore_index=True)
    merged.to_excel(output_xlsx, index=False, engine='openpyxl')
# Date window of the departure-date folders to scan, and the crawl date
# that names the sub-folder inside each of them.
start_date = datetime(2024, 11, 12)  # first departure date
end_date = datetime(2024, 11, 19)  # last departure date (inclusive)
clawer_date = datetime(2024, 11, 12)  # crawl date

# Input root and output folder.
input_base_path = "./"
output_folder = "./xlsx_output"

# Make sure the output folder exists.
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# route name -> list of CSV paths collected across all departure dates
route_files = {}
current_date = start_date
while current_date <= end_date:
    folder_name = current_date.strftime("%Y-%m-%d")
    folder_path = os.path.join(input_base_path, folder_name, clawer_date.strftime("%Y-%m-%d"))
    if os.path.exists(folder_path):
        for file_name in os.listdir(folder_path):
            if not file_name.endswith('.csv'):
                continue
            csv_path = os.path.join(folder_path, file_name)
            route = get_departure_destination(file_name)
            route_files.setdefault(route, []).append(csv_path)
    current_date += timedelta(days=1)

# Write one workbook per route.
for route, files in route_files.items():
    output_xlsx = os.path.join(output_folder, f"{route}.xlsx")
    merge_csv_files(files, output_xlsx)
    print(f"已合并并保存路线: {route} -> {output_xlsx}")
print("所有CSV文件已成功合并为XLSX文件并筛选了指定的列")

File diff suppressed because it is too large Load Diff

@ -0,0 +1,90 @@
import pandas as pd
import mysql.connector
from mysql.connector import Error
import os
from datetime import datetime, timedelta
# Database connection settings.
#
# Each value can be overridden through an environment variable so that
# credentials do not have to live in the repository. The literals below are
# kept only as backward-compatible fallbacks.
# SECURITY NOTE(review): the fallback password for the 'root' account is
# committed in plain text; rotate it and remove the fallback once every
# deployment sets FLY_TICKET_DB_PASSWORD.
db_config = {
    'host': os.environ.get('FLY_TICKET_DB_HOST', '152.136.166.253'),  # host only, no port
    'port': int(os.environ.get('FLY_TICKET_DB_PORT', '8989')),  # port is given separately
    'database': os.environ.get('FLY_TICKET_DB_NAME', 'fly_ticket'),
    'user': os.environ.get('FLY_TICKET_DB_USER', 'root'),
    'password': os.environ.get('FLY_TICKET_DB_PASSWORD', 'Cauc@2024'),
}
# Upsert statement for one flight row; built once instead of once per row.
_FLIGHT_UPSERT_SQL = """INSERT INTO flight (f_n, f_s_p, f_a_p, f_s_a, f_a_a, f_s_t, f_a_t, f_Date, f_Delay, f_p, f_food, f_wide, f_depcode, f_dstcode)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
f_s_p = VALUES(f_s_p),
f_a_p = VALUES(f_a_p),
f_s_a = VALUES(f_s_a),
f_a_a = VALUES(f_a_a),
f_s_t = VALUES(f_s_t),
f_a_t = VALUES(f_a_t),
f_Delay = VALUES(f_Delay),
f_p = VALUES(f_p),
f_food = VALUES(f_food),
f_wide = VALUES(f_wide),
f_depcode = VALUES(f_depcode),
f_dstcode = VALUES(f_dstcode);"""

# CSV columns in the same order as the placeholders of the statement above.
_FLIGHT_CSV_COLUMNS = [
    '航班号',
    '出发城市',
    '到达城市',
    '出发机场',
    '到达机场',
    '出发时间',
    '到达时间',
    '出发日期',
    '出发延误时间',
    'economy_origin',
    '经济舱餐食信息',
    '经济舱座椅间距',
    '出发机场三字码',
    '到达机场三字码',
]


def import_csv_to_db(file_path, cursor):
    """Upsert every row of a crawler CSV file into the ``flight`` table.

    Args:
        file_path: Path of the CSV file; it must contain all columns listed
            in ``_FLIGHT_CSV_COLUMNS``.
        cursor: DB-API cursor; ``cursor.execute(sql, values)`` is called once
            per CSV row. Committing is left to the caller.

    Raises:
        KeyError: If a required column is missing from the CSV file.
    """
    df = pd.read_csv(file_path)
    wanted = df[_FLIGHT_CSV_COLUMNS]
    # Convert to plain Python objects and turn missing cells (NaN) into None
    # so the driver sends SQL NULL instead of the float 'nan' / numpy scalars.
    cleaned = wanted.astype(object).where(wanted.notna(), None)
    for values in cleaned.itertuples(index=False, name=None):
        cursor.execute(_FLIGHT_UPSERT_SQL, values)
# Import every crawler CSV of the configured date window into the database.
# conn/cursor start as None so the cleanup in `finally` never touches an
# unbound name when connecting (or creating the cursor) fails.
conn = None
cursor = None
try:
    # Connect to the database.
    conn = mysql.connector.connect(**db_config)
    if conn.is_connected():
        cursor = conn.cursor()
        # Departure-date folders to scan (inclusive range).
        start_date = datetime(2024, 11, 12)
        end_date = datetime(2024, 11, 20)
        current_date = start_date
        while current_date <= end_date:
            folder_name = current_date.strftime("%Y-%m-%d")
            # Raw string: the Windows path contains backslashes that are not
            # valid escape sequences in a normal string literal.
            folder_path = os.path.join(r"D:\college\SE2\Ctrip-Crawler-main\Ctrip-Crawler-withComfortInfo", folder_name, "2024-11-12")
            if os.path.exists(folder_path):
                for file_name in os.listdir(folder_path):
                    if file_name.endswith('.csv'):
                        file_path = os.path.join(folder_path, file_name)
                        import_csv_to_db(file_path, cursor)
                        print(f"已导入文件: {file_path}")
            current_date += timedelta(days=1)
        # Commit once after all files were processed.
        conn.commit()
        print("所有数据成功插入到数据库")
except Error as e:
    print(f"连接数据库时出错: {e}")
finally:
    if conn is not None and conn.is_connected():
        if cursor is not None:
            cursor.close()
        conn.close()
        print("数据库连接已关闭")

@ -0,0 +1,412 @@
import io
import os
import gzip
import time
import json
import random
import requests
import threading
import pandas as pd
from seleniumwire import webdriver
from datetime import datetime as dt,timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException,ElementNotInteractableException,ElementClickInterceptedException # 加载异常
def getcitycode():
    """Download Ctrip's city list and return the city names and their codes.

    Returns:
        A tuple ``(cityname, citycode)`` where ``cityname`` is the list of
        display names in response order and ``citycode`` maps each display
        name to its ``data`` value from the API response.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    cityname,code=[],[]
    # City list endpoint of Ctrip; the random query value is sent as "v".
    city_url='https://flights.ctrip.com/online/api/poi/get?v='+str(random.random())
    headers={
        'dnt':'1',
        'referer':'https://verify.ctrip.com/',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    r=requests.get(city_url,headers=headers,timeout=30)
    citys=json.loads(r.text).get('data') or {}
    for group in citys:
        # The '热门' group is skipped.
        if group =='热门':
            continue
        # Each character of the group name is tried as a sub-key of the group.
        for key in group:
            try:
                entries=citys[group][key]
            except (KeyError,TypeError,IndexError):
                continue
            for entry in entries:
                try:
                    display,data=entry['display'],entry['data']
                except (KeyError,TypeError):
                    # Skip malformed entries; reading both fields before
                    # appending keeps the two lists aligned for zip().
                    continue
                cityname.append(display)
                code.append(data)
    citycode=dict(zip(cityname,code))
    return cityname,citycode
class FLIGHT(object):
    """Selenium-wire scraper for Ctrip one-way flight lists, driven by result-page URLs.

    The methods chain into each other: getpage -> remove_btn -> changecity ->
    getdata -> decode_data -> check_data -> muti_process -> mergedata. Failures
    restart the chain from an earlier step. `demain` is the entry point.
    """

    def __init__(self):
        self.url = 'https://flights.ctrip.com/online/list/oneway'  # flight search result page
        self.chromeDriverPath = 'C:/Program Files/Google/Chrome/Application/chromedriver'  # chromedriver location
        self.options = webdriver.ChromeOptions()  # browser option object
        #self.options.add_argument('--incognito')  # incognito mode
        #self.options.add_argument('User-Agent=%s'%UserAgent().random)  # replace the User-Agent
        self.options.add_argument("--disable-blink-features")
        self.options.add_argument("--disable-blink-features=AutomationControlled")
        # Hide the "controlled by automated software" banner.
        self.options.add_experimental_option("excludeSwitches", ['enable-automation'])
        self.driver = webdriver.Chrome(executable_path=self.chromeDriverPath,chrome_options=self.options)
        self.driver.maximize_window()
        self.err=0  # consecutive element-error retries in changecity

    def getpage(self):
        """Open the result page for the current city pair and date, then continue the chain."""
        # Last three characters of the city code entries are used in the URL.
        self.startcode=self.citycode[self.city[0]][-3:]
        self.endcode=self.citycode[self.city[1]][-3:]
        flights_url=self.url+'-'+self.startcode+'-'+self.endcode+'?&depdate='+self.date
        print(flights_url)
        # Page load timeout in seconds.
        self.driver.set_page_load_timeout(300)
        try:
            self.driver.get(flights_url)
        except Exception:
            print('页面连接失败')
            # NOTE(review): the window is closed and the same driver is reused
            # by the retry below — confirm this can succeed with a single window.
            self.driver.close()
            self.getpage()
        else:
            try:
                # Raises when the captcha/alert element is absent.
                self.driver.find_element(By.CLASS_NAME,"basic-alert.alert-giftinfo")
                print('等待2小时后重试')
                time.sleep(7200)
                self.getpage()
            except Exception:
                # No captcha found: go on with the next step.
                self.remove_btn()

    def remove_btn(self):
        """Remove the '.notice-box' element via JavaScript, then switch cities."""
        try:
            js_remove="$('.notice-box').remove();"
            self.driver.execute_script(js_remove)
        except Exception as e:
            print('防疫移除失败',e)
        else:
            self.changecity()

    @staticmethod
    def _fill_city(box, name):
        """Retype `name` into the input `box` until its value contains `name`."""
        while name not in box.get_attribute('value'):
            box.click()
            time.sleep(0.5)
            box.send_keys(Keys.CONTROL + 'a')
            time.sleep(0.5)
            box.send_keys(name)
            time.sleep(0.5)

    def changecity(self):
        """Enter the departure/arrival cities into the search form and trigger the search."""
        try:
            # Input elements of the search form: [0] departure, [1] arrival.
            its=self.driver.find_elements(By.CLASS_NAME,'form-input-v3')
            self._fill_city(its[0],self.city[0])
            self._fill_city(its[1],self.city[1])
            try:
                # Clicking the low-price reminder button submits the form.
                self.driver.implicitly_wait(5) # seconds
                self.driver.find_elements(By.CLASS_NAME,'low-price-remind')[0].click()
            except IndexError as e:
                print('\n更换城市错误 找不到元素',e)
                # Fallback: submit with the ENTER key.
                its[1].send_keys(Keys.ENTER)
            print('\n更换城市成功',self.city[0]+'-'+self.city[1])
        except (ElementNotInteractableException,StaleElementReferenceException,ElementClickInterceptedException) as e:
            print('\n更换城市错误 元素错误',e)
            self.err+=1
            if self.err<=5:
                # Retry this step (the original called a non-existent click_btn()).
                self.changecity()
            else:
                self.err=0
                del self.driver.requests
                self.getpage()
        except Exception as e:
            print('\n更换城市错误',e)
            # Drop the captured requests and restart from the page load.
            del self.driver.requests
            self.getpage()
        else:
            self.err=0
            self.getdata()

    def getdata(self):
        """Wait for the batchSearch request and verify it belongs to the current city pair."""
        try:
            # Wait until the search request has been captured.
            self.predata = self.driver.wait_for_request('/international/search/api/search/batchSearch?.*', timeout=60)
            rb=dict(json.loads(self.predata.body).get('flightSegments')[0])
        except TimeoutException as e:
            print('\n获取数据错误',e)
            del self.driver.requests
            # Restart from the page load.
            self.getpage()
        else:
            if rb['departureCityName'] == self.city[0] and rb['arrivalCityName'] == self.city[1]:
                print('城市获取正确')
                del self.driver.requests
                self.decode_data()
            else:
                # The captured request was for another city pair: re-enter the cities.
                del self.driver.requests
                self.changecity()

    def decode_data(self):
        """Gunzip and JSON-decode the captured response body into ``self.dedata``."""
        try:
            buf = io.BytesIO(self.predata.response.body)
            gf = gzip.GzipFile(fileobj = buf)
            self.dedata = gf.read().decode('UTF-8')
            self.dedata=json.loads(self.dedata)
        except Exception:
            print('重新获取数据')
            self.getpage()
        else:
            self.check_data()

    def check_data(self):
        """Keep only itineraries with transferCount == 0 and process them.

        Returns:
            0 when no such itinerary exists or the payload is malformed,
            otherwise None.
        """
        try:
            self.flightItineraryList=self.dedata['data']['flightItineraryList']
            # Walk backwards so popping does not shift the remaining indexes.
            for i in range(len(self.flightItineraryList)-1, -1, -1):
                if self.flightItineraryList[i]['flightSegments'][0]['transferCount'] !=0:
                    self.flightItineraryList.pop(i)
            if len(self.flightItineraryList):
                self.muti_process()
            else:
                print('不存在直航航班')
                return 0
        except Exception:
            print('不存在直航航班')
            return 0

    def muti_process(self):
        """Run flight and price extraction in two threads, then merge the results."""
        self.flights = pd.DataFrame()
        self.prices = pd.DataFrame()
        workers = [
            threading.Thread(target=self.proc_flightSegments),  # flight details
            threading.Thread(target=self.proc_priceList),  # fare details
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        self.mergedata()

    def proc_flightSegments(self):
        """Append one row per itinerary (its first flight) to ``self.flights``."""
        # Fields removed from every flight record.
        dellist=['sequenceNo', 'marketAirlineCode',
                 'departureProvinceId','departureCityId','departureCityCode','departureAirportShortName','departureTerminal',
                 'arrivalProvinceId','arrivalCityId','arrivalCityCode','arrivalAirportShortName','arrivalTerminal',
                 'transferDuration','stopList','leakedVisaTagSwitch','trafficType','highLightPlaneNo','mealType',
                 'operateAirlineCode','arrivalDateTime','departureDateTime','operateFlightNo','operateAirlineName']
        for itinerary in self.flightItineraryList:
            flightUnitList=dict(itinerary['flightSegments'][0]['flightList'][0])
            departure=flightUnitList['departureDateTime'].split(' ')
            arrival=flightUnitList['arrivalDateTime'].split(' ')
            departureday,departuretime=departure[0],departure[1]
            arrivalday,arrivaltime=arrival[0],arrival[1]
            for value in dellist:
                flightUnitList.pop(value,None)
            # Store date and time as separate fields.
            flightUnitList.update({'departureday': departureday, 'departuretime': departuretime,
                                   'arrivalday': arrivalday, 'arrivaltime': arrivaltime})
            self.flights=pd.concat([self.flights,pd.DataFrame(flightUnitList,index=[0])],ignore_index=True)

    @staticmethod
    def _summarise_cabin(prices, discounts):
        """Return ``(origin, low, cut)`` for one cabin class.

        `origin` is the price whose discount rate equals 1, or
        ``int(max(prices)/max(discounts))`` when there is none. `low`/`cut`
        are the minimum price/discount, or '' when the minimum discount is 1.
        All three are '' when `prices` is empty.
        """
        if not prices:
            return '','',''
        try:
            origin=prices[discounts.index(1)]
        except ValueError:
            origin=int(max(prices)/max(discounts))
        if min(discounts) !=1:
            return origin,min(prices),min(discounts)
        return origin,'',''

    def proc_priceList(self):
        """Append one row of economy/business fare figures per itinerary to ``self.prices``."""
        for itinerary in self.flightItineraryList:
            flightNo=itinerary['itineraryId'].split('_')[0]
            # economy prices/discounts and business prices/discounts
            economy,economy_discount=[],[]
            bussiness,bussiness_discount=[],[]
            for price in itinerary['priceList']:
                adultPrice=price['adultPrice']
                cabin=price['cabin']
                discountRate=dict(price['priceUnitList'][0]['flightSeatList'][0])['discountRate']
                if cabin=='Y':
                    economy.append(adultPrice)
                    economy_discount.append(discountRate)
                elif cabin=='C':
                    bussiness.append(adultPrice)
                    bussiness_discount.append(discountRate)
            economy_origin,economy_low,economy_cut=self._summarise_cabin(economy,economy_discount)
            bussiness_origin,bussiness_low,bussiness_cut=self._summarise_cabin(bussiness,bussiness_discount)
            price_info={'flightNo':flightNo,
                        'economy_origin':economy_origin,'economy_low':economy_low,'economy_cut':economy_cut,
                        'bussiness_origin':bussiness_origin,'bussiness_low':bussiness_low,'bussiness_cut':bussiness_cut}
            self.prices=pd.concat([self.prices,pd.DataFrame(price_info,index=[0])],ignore_index=True)

    def mergedata(self):
        """Join flights and prices on flightNo, rename the columns and write the CSV file."""
        try:
            self.df = self.flights.merge(self.prices,on=['flightNo'])
            self.df['数据获取日期']=dt.now().strftime('%Y-%m-%d')
            # Output column names, in output order ...
            order=['数据获取日期','航班号','航空公司',
                   '出发日期','出发时间','到达日期','到达时间','飞行时长','出发国家','出发城市','出发机场','出发机场三字码',
                   '到达国家','到达城市','到达机场','到达机场三字码','飞机型号','飞机尺寸','飞机型号三字码',
                   '经济舱原价','经济舱最低价','经济舱折扣','商务舱原价','商务舱最低价','商务舱折扣',
                   '到达准点率','停留次数']
            # ... and the source column each of them is renamed from.
            origin=['数据获取日期','flightNo','marketAirlineName',
                    'departureday','departuretime','arrivalday','arrivaltime','duration',
                    'departureCountryName','departureCityName','departureAirportName','departureAirportCode',
                    'arrivalCountryName','arrivalCityName','arrivalAirportName','arrivalAirportCode',
                    'aircraftName','aircraftSize','aircraftCode',
                    'economy_origin','economy_low','economy_cut',
                    'bussiness_origin','bussiness_low','bussiness_cut',
                    'arrivalPunctuality','stopCount']
            columns=dict(zip(origin,order))
            self.df=self.df.rename(columns=columns)
            self.df = self.df[order]
            if not os.path.exists(self.date):
                os.makedirs(self.date)
            # os.path.join instead of a hard-coded '\\' so the path also works off Windows.
            filename=os.path.join(os.getcwd(),self.date,self.date+'-'+self.city[0]+'-'+self.city[1]+'.csv')
            self.df.to_csv(filename,encoding='GB18030',index=False)
            print('\n数据爬取完成',filename)
        except Exception as e:
            print('合并数据失败',e)

    def demain(self,citys,citycode):
        """Crawl every city pair in `citys` for the date seven days from now, then quit.

        Args:
            citys: List of ``[departure, arrival]`` city-name pairs.
            citycode: Mapping of city name to the code string used by getpage.
        """
        self.citycode=citycode
        # Departure date: today + 7 days.
        self.date=dt.now()+timedelta(days=7)
        self.date=self.date.strftime('%Y-%m-%d')
        for position,city in enumerate(citys):
            self.city=city
            if position==0:
                # First pair: load the page.
                self.getpage()
            else:
                # Later pairs only need the cities changed.
                self.changecity()
        self.driver.quit()
if __name__ == '__main__':
    # Script entry: crawl every ordered pair of distinct cities.
    cityname,citycode=getcitycode()
    city=['上海','广州','深圳','北京']
    ytic=list(reversed(city))
    # All ordered pairs [m, n] with m != n; n runs through the reversed list.
    citys=[[m,n] for m in city for n in ytic if m!=n]
    fly = FLIGHT()
    fly.demain(citys,citycode)
    print('\n程序运行完成!!!!')

@ -0,0 +1,397 @@
import io
import os
import gzip
import time
import json
import threading
import pandas as pd
from seleniumwire import webdriver
from datetime import datetime as dt,timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException,ElementNotInteractableException,ElementClickInterceptedException # 加载异常
class FLIGHT(object):
    """Selenium-wire scraper for Ctrip one-way flight lists, driven from the home page form.

    The methods chain into each other: getpage -> changecity -> getdata ->
    decode_data -> check_data -> muti_process -> mergedata. Failures restart
    the chain from an earlier step. `demain` is the entry point.
    """

    def __init__(self):
        self.chromeDriverPath = 'C:/Program Files/Google/Chrome/Application/chromedriver'  # chromedriver location
        self.options = webdriver.ChromeOptions()  # browser option object
        self.options.add_argument('--incognito')  # incognito mode
        self.options.add_argument("--disable-blink-features")
        self.options.add_argument("--disable-blink-features=AutomationControlled")
        # Hide the "controlled by automated software" banner.
        self.options.add_experimental_option("excludeSwitches", ['enable-automation'])
        self.driver = webdriver.Chrome(executable_path=self.chromeDriverPath,chrome_options=self.options)
        self.driver.set_page_load_timeout(300)  # page load timeout in seconds
        self.driver.maximize_window()
        self.err=0  # consecutive element-error retries in changecity
        # Open the home page.
        self.driver.get('https://flights.ctrip.com/online/channel/domestic')

    def getpage(self):
        """Select one-way and the departure date on the home page, search, then continue the chain."""
        try:
            # Click the plane icon to return to the main view.
            self.driver.find_element(By.CLASS_NAME,'pc_home-jipiao').click()
            self.driver.implicitly_wait(5) # seconds
            # First radio label: one-way.
            self.driver.find_elements(By.CLASS_NAME,'radio-label')[0].click()
            # Repeat until the date input shows the wanted date.
            while self.driver.find_elements(By.CSS_SELECTOR,"[aria-label=请选择日期]")[0].get_attribute("value") != self.date:
                # Open the date picker.
                self.driver.find_element(By.CLASS_NAME,'modifyDate.depart-date').click()
                for m in self.driver.find_elements(By.CLASS_NAME,'date-picker.date-picker-block'):
                    # Skip month panels that do not match the month of self.date.
                    if int(m.find_element(By.CLASS_NAME,'month').text[:-1]) != int(self.date[5:7]):
                        continue
                    for d in m.find_elements(By.CLASS_NAME,'date-d'):
                        if int(d.text) == int(self.date[-2:]):
                            d.click()
                            break
            # Start the search.
            self.driver.find_element(By.CLASS_NAME,'search-btn').click()
        except Exception:
            print('页面连接失败')
            # NOTE(review): the window is closed and the same driver is reused
            # by the retry below — confirm this can succeed with a single window.
            self.driver.close()
            self.getpage()
        else:
            try:
                # Raises when the captcha element is absent.
                self.driver.find_element(By.ID,"verification-code")
                print('等待2小时后重试')
                time.sleep(7200)
                self.getpage()
            except Exception:
                # No captcha found: go on with the next step.
                self.changecity()

    def remove_btn(self):
        """Remove the '.notice-box' element via JavaScript; failures are only printed."""
        try:
            js_remove="$('.notice-box').remove();"
            self.driver.execute_script(js_remove)
        except Exception as e:
            print('防疫移除失败',e)

    @staticmethod
    def _fill_city(box, name):
        """Retype `name` into the input `box` until its value contains `name`."""
        while name not in box.get_attribute('value'):
            box.click()
            time.sleep(0.5)
            box.send_keys(Keys.CONTROL + 'a')
            time.sleep(0.5)
            box.send_keys(name)
            time.sleep(0.5)

    def changecity(self):
        """Enter the departure/arrival cities into the search form and trigger the search."""
        # Remove the notice box first.
        self.remove_btn()
        try:
            # Input elements of the search form: [0] departure, [1] arrival.
            its=self.driver.find_elements(By.CLASS_NAME,'form-input-v3')
            self._fill_city(its[0],self.city[0])
            self._fill_city(its[1],self.city[1])
            try:
                # Clicking the low-price reminder button submits the form.
                self.driver.implicitly_wait(5) # seconds
                self.driver.find_elements(By.CLASS_NAME,'low-price-remind')[0].click()
            except IndexError as e:
                print('\n更换城市错误 找不到元素',e)
                # Fallback: submit with the ENTER key.
                its[1].send_keys(Keys.ENTER)
            print('\n更换城市成功',self.city[0]+'-'+self.city[1])
        except (IndexError,ElementNotInteractableException,StaleElementReferenceException,ElementClickInterceptedException) as e:
            print('\n更换城市错误 元素错误',e)
            self.err+=1
            if self.err<=5:
                self.changecity()
            else:
                self.err=0
                del self.driver.requests
                self.getpage()
        except Exception as e:
            print('\n更换城市错误',e)
            # Drop the captured requests and restart from the home page.
            del self.driver.requests
            self.getpage()
        else:
            self.err=0
            self.getdata()

    def getdata(self):
        """Wait for the batchSearch request and verify it belongs to the current city pair."""
        try:
            # Wait until the search request has been captured.
            self.predata = self.driver.wait_for_request('/international/search/api/search/batchSearch?.*', timeout=30)
            rb=dict(json.loads(self.predata.body).get('flightSegments')[0])
        except TimeoutException as e:
            print('\n获取数据错误',e)
            del self.driver.requests
            # Restart from the home page.
            self.getpage()
        else:
            if rb['departureCityName'] == self.city[0] and rb['arrivalCityName'] == self.city[1]:
                print('城市获取正确')
                del self.driver.requests
                self.decode_data()
            else:
                # The captured request was for another city pair: re-enter the cities.
                del self.driver.requests
                self.changecity()

    def decode_data(self):
        """Gunzip and JSON-decode the captured response body into ``self.dedata``."""
        try:
            buf = io.BytesIO(self.predata.response.body)
            gf = gzip.GzipFile(fileobj = buf)
            self.dedata = gf.read().decode('UTF-8')
            self.dedata=json.loads(self.dedata)
        except Exception:
            print('重新获取数据')
            self.getpage()
        else:
            self.check_data()

    def check_data(self):
        """Keep only itineraries with transferCount == 0 and process them.

        Returns:
            0 when no such itinerary exists or the payload is malformed,
            otherwise None.
        """
        try:
            self.flightItineraryList=self.dedata['data']['flightItineraryList']
            # Walk backwards so popping does not shift the remaining indexes.
            for i in range(len(self.flightItineraryList)-1, -1, -1):
                if self.flightItineraryList[i]['flightSegments'][0]['transferCount'] !=0:
                    self.flightItineraryList.pop(i)
            if len(self.flightItineraryList):
                self.muti_process()
            else:
                print('不存在直航航班')
                return 0
        except Exception:
            print('不存在直航航班')
            return 0

    def muti_process(self):
        """Run flight and price extraction in two threads, then merge the results."""
        self.flights = pd.DataFrame()
        self.prices = pd.DataFrame()
        workers = [
            threading.Thread(target=self.proc_flightSegments),  # flight details
            threading.Thread(target=self.proc_priceList),  # fare details
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        self.mergedata()

    def proc_flightSegments(self):
        """Append one row per itinerary (its first flight) to ``self.flights``."""
        # Fields removed from every flight record.
        dellist=['sequenceNo', 'marketAirlineCode',
                 'departureProvinceId','departureCityId','departureCityCode','departureAirportShortName','departureTerminal',
                 'arrivalProvinceId','arrivalCityId','arrivalCityCode','arrivalAirportShortName','arrivalTerminal',
                 'transferDuration','stopList','leakedVisaTagSwitch','trafficType','highLightPlaneNo','mealType',
                 'operateAirlineCode','arrivalDateTime','departureDateTime','operateFlightNo','operateAirlineName']
        for itinerary in self.flightItineraryList:
            flightUnitList=dict(itinerary['flightSegments'][0]['flightList'][0])
            departure=flightUnitList['departureDateTime'].split(' ')
            arrival=flightUnitList['arrivalDateTime'].split(' ')
            departureday,departuretime=departure[0],departure[1]
            arrivalday,arrivaltime=arrival[0],arrival[1]
            for value in dellist:
                flightUnitList.pop(value,None)
            # Store date and time as separate fields.
            flightUnitList.update({'departureday': departureday, 'departuretime': departuretime,
                                   'arrivalday': arrivalday, 'arrivaltime': arrivaltime})
            self.flights=pd.concat([self.flights,pd.DataFrame(flightUnitList,index=[0])],ignore_index=True)

    @staticmethod
    def _summarise_cabin(prices, discounts):
        """Return ``(origin, low, cut)`` for one cabin class.

        `origin` is the price whose discount rate equals 1, or
        ``int(max(prices)/max(discounts))`` when there is none. `low`/`cut`
        are the minimum price/discount, or '' when the minimum discount is 1.
        All three are '' when `prices` is empty.
        """
        if not prices:
            return '','',''
        try:
            origin=prices[discounts.index(1)]
        except ValueError:
            origin=int(max(prices)/max(discounts))
        if min(discounts) !=1:
            return origin,min(prices),min(discounts)
        return origin,'',''

    def proc_priceList(self):
        """Append one row of economy/business fare figures per itinerary to ``self.prices``."""
        for itinerary in self.flightItineraryList:
            flightNo=itinerary['itineraryId'].split('_')[0]
            # economy prices/discounts and business prices/discounts
            economy,economy_discount=[],[]
            bussiness,bussiness_discount=[],[]
            for price in itinerary['priceList']:
                adultPrice=price['adultPrice']
                cabin=price['cabin']
                discountRate=dict(price['priceUnitList'][0]['flightSeatList'][0])['discountRate']
                if cabin=='Y':
                    economy.append(adultPrice)
                    economy_discount.append(discountRate)
                elif cabin=='C':
                    bussiness.append(adultPrice)
                    bussiness_discount.append(discountRate)
            economy_origin,economy_low,economy_cut=self._summarise_cabin(economy,economy_discount)
            bussiness_origin,bussiness_low,bussiness_cut=self._summarise_cabin(bussiness,bussiness_discount)
            price_info={'flightNo':flightNo,
                        'economy_origin':economy_origin,'economy_low':economy_low,'economy_cut':economy_cut,
                        'bussiness_origin':bussiness_origin,'bussiness_low':bussiness_low,'bussiness_cut':bussiness_cut}
            self.prices=pd.concat([self.prices,pd.DataFrame(price_info,index=[0])],ignore_index=True)

    def mergedata(self):
        """Join flights and prices on flightNo, rename the columns and write the CSV file."""
        try:
            self.df = self.flights.merge(self.prices,on=['flightNo'])
            self.df['数据获取日期']=dt.now().strftime('%Y-%m-%d')
            # Output column names, in output order ...
            order=['数据获取日期','航班号','航空公司',
                   '出发日期','出发时间','到达日期','到达时间','飞行时长','出发国家','出发城市','出发机场','出发机场三字码',
                   '到达国家','到达城市','到达机场','到达机场三字码','飞机型号','飞机尺寸','飞机型号三字码',
                   '经济舱原价','经济舱最低价','经济舱折扣','商务舱原价','商务舱最低价','商务舱折扣',
                   '到达准点率','停留次数']
            # ... and the source column each of them is renamed from.
            origin=['数据获取日期','flightNo','marketAirlineName',
                    'departureday','departuretime','arrivalday','arrivaltime','duration',
                    'departureCountryName','departureCityName','departureAirportName','departureAirportCode',
                    'arrivalCountryName','arrivalCityName','arrivalAirportName','arrivalAirportCode',
                    'aircraftName','aircraftSize','aircraftCode',
                    'economy_origin','economy_low','economy_cut',
                    'bussiness_origin','bussiness_low','bussiness_cut',
                    'arrivalPunctuality','stopCount']
            columns=dict(zip(origin,order))
            self.df=self.df.rename(columns=columns)
            self.df = self.df[order]
            if not os.path.exists(self.date):
                os.makedirs(self.date)
            # os.path.join instead of a hard-coded '\\' so the path also works off Windows.
            filename=os.path.join(os.getcwd(),self.date,self.date+'-'+self.city[0]+'-'+self.city[1]+'.csv')
            self.df.to_csv(filename,encoding='GB18030',index=False)
            print('\n数据爬取完成',filename)
        except Exception as e:
            print('合并数据失败',e)

    def demain(self,citys):
        """Crawl every city pair in `citys` for tomorrow's date, then quit the browser.

        Args:
            citys: List of ``[departure, arrival]`` city-name pairs.
        """
        # Departure date: today + 1 day.
        self.date=dt.now()+timedelta(days=1)
        self.date=self.date.strftime('%Y-%m-%d')
        for position,city in enumerate(citys):
            self.city=city
            if position==0:
                # First pair: go through the home page form.
                self.getpage()
            else:
                # Later pairs only need the cities changed.
                self.changecity()
        self.driver.quit()
if __name__ == '__main__':
    # Script entry: crawl every ordered pair of distinct cities.
    city=['上海','广州','深圳','北京']
    ytic=list(reversed(city))
    # All ordered pairs [m, n] with m != n; n runs through the reversed list.
    citys=[[m,n] for m in city for n in ytic if m!=n]
    fly = FLIGHT()
    fly.demain(citys)
    print('\n程序运行完成!!!!')

@ -0,0 +1,143 @@
import requests
import datetime
import re
import demjson
import time
import pandas as pd
def create_assist_date(datestart = None,dateend = None):
    """Return the list of 'YYYY-MM-DD' strings from `datestart` to `dateend`, inclusive.

    Args:
        datestart: First date as 'YYYY-MM-DD'; defaults to '2020-01-01'.
        dateend: Last date as 'YYYY-MM-DD'; defaults to yesterday.

    Returns:
        One string per day. `datestart` is always included, even when it lies
        after `dateend`.
    """
    fmt = '%Y-%m-%d'
    if datestart is None:
        datestart = '2020-01-01'
    if dateend is None:
        yesterday = datetime.datetime.now() + datetime.timedelta(days=-1)
        dateend = yesterday.strftime(fmt)

    first = datetime.datetime.strptime(datestart, fmt)
    last = datetime.datetime.strptime(dateend, fmt)
    # Both values sit at midnight, so the gap is a whole number of days.
    extra_days = max((last - first).days, 0)
    return [
        (first + datetime.timedelta(days=offset)).strftime(fmt)
        for offset in range(extra_days + 1)
    ]
def getdata(citys,dateseries):
    """Download historical fares per city pair and date, then hand them to proc_data.

    Args:
        citys: List of ``[departure, arrival]`` city-name pairs.
        dateseries: Iterable of 'YYYY-MM-DD' departure dates to query.
    """
    url='https://www.lsjpjg.com/getthis.php'
    headers={
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': 'www.lsjpjg.com',
        'Origin': 'https://www.lsjpjg.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4647.116 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    for city in citys:
        collected=pd.DataFrame()
        misses=0
        for date in dateseries:
            payload={'dep_dt': date,'dep_ct': city[0],'arr_ct': city[1]}
            res=requests.post(url, headers=headers,data=payload)
            # An empty JSON list (with BOM) means no flights for that day.
            if res.text=='\ufeff[]' :
                print(city,'无航班',date)
                misses+=1
                # Give up on this route once the miss counter exceeds 30.
                if misses>30:
                    break
                continue
            misses-=1
            print(city,date)
            res.encoding=res.apparent_encoding
            # Strip every '/' from the response text before decoding.
            stripped = res.text.replace("/","")
            try:
                parsed=demjson.decode(stripped.encode('utf-8'))
            except:
                continue
            day_frame=pd.DataFrame(parsed)
            try:
                day_frame.drop('icon',axis=1,inplace=True)
                day_frame['出发日期']=date
            except:
                continue
            collected=pd.concat([collected,day_frame])
            time.sleep(0.5)
        filename=city[0]+'-'+city[1]
        # Post-process and save the raw data of this route.
        proc_data(filename,collected,interval=8)
def proc_data(filename,df,interval=8):
    """Save the raw fares, derive full fares, and save per-departure-date averages.

    Writes ``<filename>.csv`` (raw data) and ``result-<filename>.csv``
    (average price, average full fare and their ratio per '出发日期'), both
    encoded as GB18030.

    Args:
        filename: Base name of the output files.
        df: Raw fare rows with at least the columns 'discount', 'price',
            '出发日期' and 'qry_dt' (both dates as 'YYYY-MM-DD').
        interval: Rows queried more than this many days before departure
            are discarded.
    """
    # Save the raw data first, exactly as received.
    df.to_csv(filename+'.csv',encoding='GB18030')

    # The caller concatenates per-day frames, so index labels repeat; a fresh
    # unique index is needed for the per-row scalar access below.
    df=df.reset_index(drop=True)
    df['全票价']=0.0
    df['日期差']=None

    number_re=re.compile(r'\d+\.?\d*')
    date_fmt='%Y-%m-%d'
    rejected=[]
    for i in df.index:
        try:
            label=df.at[i,'discount']
            if '经济' not in label:
                # Only economy fares are kept.
                rejected.append(i)
                continue
            # NOTE(review): the extracted source tested `'' in label`, which is
            # always true; '折' (discount marker) is assumed from the
            # discount-rate computation below — confirm against the original.
            is_discounted='折' in label
            is_full_fare=any(tag in label for tag in ('全价','经典'))
            if not (is_discounted or is_full_fare):
                continue
            # Days between the query date and the departure date.
            delta=datetime.datetime.strptime(df.at[i,'出发日期'],date_fmt)-datetime.datetime.strptime(df.at[i,'qry_dt'],date_fmt)
            if delta.days >interval:
                rejected.append(i)
                continue
            df.at[i,'日期差']=delta.days
            if is_discounted:
                # The label carries the rate in tenths (5 -> 50 %), hence the *10.
                discount=float(number_re.findall(label)[0])
                df.at[i,'全票价']=df.at[i,'price']/discount*10
            else:
                df.at[i,'全票价']=df.at[i,'price']
        except Exception:
            # Best effort: rows that cannot be interpreted are discarded.
            rejected.append(i)
    df=df.drop(index=rejected)

    avg_full_price=df[df['全票价']!=0].groupby(['出发日期'])[['全票价']].mean()
    avg_price=df[df['全票价']!=df['price']].groupby(['出发日期'])[['price']].mean()
    result=pd.concat([avg_price,avg_full_price],axis=1)
    result['折扣']=result['price']/result['全票价']
    # Save the processed data (the original added the DataFrame itself to the file name).
    result.to_csv('result-'+filename+'.csv',encoding='GB18030')
if __name__ == '__main__':
    # Script entry: default date range (2020-01-01 .. yesterday).
    dateseries=create_assist_date(datestart = None,dateend = None)
    city=['上海','广州','深圳','北京']
    ytic=list(reversed(city))
    # All ordered pairs [m, n] with m != n; n runs through the reversed list.
    citys=[[m,n] for m in city for n in ytic if m!=n]
    getdata(citys,dateseries)

@ -0,0 +1,17 @@
# must be unique in a given SonarQube instance
sonar.projectKey=clawer
# --- optional properties ---
# defaults to project key
sonar.projectName=clawer
# defaults to 'not provided'
#sonar.projectVersion=1.0
# Path is relative to the sonar-project.properties file. Defaults to .
#sonar.sources=src,WebContent
# Encoding of the source code. Default is default system encoding
sonar.sourceEncoding=UTF-8
#sonar.java.binaries=target/classes/javabean,target/classes/servlet

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save