|
|
@ -16,13 +16,13 @@ from datetime import datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 爬取的城市
|
|
|
|
# 爬取的城市
|
|
|
|
crawal_citys = ["北京", "泉州"]
|
|
|
|
crawal_citys = ["天津", "贵阳"]
|
|
|
|
|
|
|
|
|
|
|
|
# 爬取日期范围:起始日期。格式'2023-12-01'
|
|
|
|
# 爬取日期范围:起始日期。格式'2023-12-01'
|
|
|
|
begin_date = "2024-10-13"
|
|
|
|
begin_date = "2024-10-21"
|
|
|
|
|
|
|
|
|
|
|
|
# 爬取日期范围:结束日期。格式'2023-12-31'
|
|
|
|
# 爬取日期范围:结束日期。格式'2023-12-31'
|
|
|
|
end_date = "2024-10-15"
|
|
|
|
end_date = "2024-10-22"
|
|
|
|
|
|
|
|
|
|
|
|
# 爬取T+N,即N天后
|
|
|
|
# 爬取T+N,即N天后
|
|
|
|
start_interval = 1
|
|
|
|
start_interval = 1
|
|
|
@ -43,7 +43,7 @@ max_wait_time = 10
|
|
|
|
max_retry_time = 5
|
|
|
|
max_retry_time = 5
|
|
|
|
|
|
|
|
|
|
|
|
# 是否只抓取直飞信息(True: 只抓取直飞,False: 抓取所有航班)
|
|
|
|
# 是否只抓取直飞信息(True: 只抓取直飞,False: 抓取所有航班)
|
|
|
|
direct_flight = True
|
|
|
|
direct_flight = False
|
|
|
|
|
|
|
|
|
|
|
|
# 是否删除不重要的信息
|
|
|
|
# 是否删除不重要的信息
|
|
|
|
del_info = False
|
|
|
|
del_info = False
|
|
|
@ -116,7 +116,7 @@ def init_driver():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def gen_citys(crawal_citys):
|
|
|
|
def gen_citys(crawal_citys):
|
|
|
|
# 生成城市组合列表
|
|
|
|
# 生成城市组合列表
|
|
|
|
citys = []
|
|
|
|
citys = []
|
|
|
|
ytic = list(reversed(crawal_citys))
|
|
|
|
ytic = list(reversed(crawal_citys))
|
|
|
|
for m in crawal_citys:
|
|
|
|
for m in crawal_citys:
|
|
|
@ -179,6 +179,7 @@ class DataFetcher(object):
|
|
|
|
self.city = None
|
|
|
|
self.city = None
|
|
|
|
self.err = 0 # 错误重试次数
|
|
|
|
self.err = 0 # 错误重试次数
|
|
|
|
self.switch_acc = 0 #切换账户
|
|
|
|
self.switch_acc = 0 #切换账户
|
|
|
|
|
|
|
|
self.comfort_data = None # 新添加的属性
|
|
|
|
|
|
|
|
|
|
|
|
def refresh_driver(self):
|
|
|
|
def refresh_driver(self):
|
|
|
|
try:
|
|
|
|
try:
|
|
|
@ -217,7 +218,7 @@ class DataFetcher(object):
|
|
|
|
# 移除分享链接
|
|
|
|
# 移除分享链接
|
|
|
|
self.driver.execute_script("document.querySelectorAll('.shareline').forEach(element => element.remove());")
|
|
|
|
self.driver.execute_script("document.querySelectorAll('.shareline').forEach(element => element.remove());")
|
|
|
|
'''
|
|
|
|
'''
|
|
|
|
# 使用JavaScript删除所有的<dl>标签
|
|
|
|
# 使用JavaScript除有的<dl>标签
|
|
|
|
self.driver.execute_script("""
|
|
|
|
self.driver.execute_script("""
|
|
|
|
var elements = document.getElementsByTagName('dl');
|
|
|
|
var elements = document.getElementsByTagName('dl');
|
|
|
|
while(elements.length > 0){
|
|
|
|
while(elements.length > 0){
|
|
|
@ -233,17 +234,22 @@ class DataFetcher(object):
|
|
|
|
def check_verification_code(self):
|
|
|
|
def check_verification_code(self):
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
# 检查是否有验证码元素,如果有,则需要人工处理
|
|
|
|
# 检查是否有验证码元素,如果有,则需要人工处理
|
|
|
|
if (len(self.driver.find_elements(By.ID, "verification-code"))+len(self.driver.find_elements(By.CLASS_NAME, "alert-title"))):
|
|
|
|
if (len(self.driver.find_elements(By.ID, "verification-code")) + len(self.driver.find_elements(By.CLASS_NAME, "alert-title"))):
|
|
|
|
print(
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码被触发verification-code/alert-title,请手动完成验证。')
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码被触发verification-code/alert-title,等待{crawal_interval*100}后重试。'
|
|
|
|
|
|
|
|
|
|
|
|
# 等待用户手动处理验证码
|
|
|
|
|
|
|
|
input("请完成验证码,然后按回车键继续...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 等待页面加载完成
|
|
|
|
|
|
|
|
WebDriverWait(self.driver, max_wait_time).until(
|
|
|
|
|
|
|
|
EC.presence_of_element_located((By.CLASS_NAME, "pc_home-jipiao"))
|
|
|
|
)
|
|
|
|
)
|
|
|
|
self.driver.quit()
|
|
|
|
|
|
|
|
time.sleep(crawal_interval*100)
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码处理完成,继续执行。')
|
|
|
|
self.driver = init_driver()
|
|
|
|
|
|
|
|
self.err = 0
|
|
|
|
# 移除注意事项
|
|
|
|
self.switch_acc += 1
|
|
|
|
self.remove_btn()
|
|
|
|
self.get_page(1)
|
|
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
# 移除注意事项
|
|
|
|
# 移除注意事项
|
|
|
|
self.remove_btn()
|
|
|
|
self.remove_btn()
|
|
|
@ -253,6 +259,7 @@ class DataFetcher(object):
|
|
|
|
print(
|
|
|
|
print(
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:未知错误,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:未知错误,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
def login(self):
|
|
|
|
def login(self):
|
|
|
|
if login_allowed:
|
|
|
|
if login_allowed:
|
|
|
@ -647,7 +654,7 @@ class DataFetcher(object):
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
print(
|
|
|
|
print(
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city:更换城市和日期失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city:更换市和日期失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# 检查注意事项和验证码
|
|
|
|
# 检查注意事项和验证码
|
|
|
@ -689,11 +696,12 @@ class DataFetcher(object):
|
|
|
|
self.predata = self.driver.wait_for_request(
|
|
|
|
self.predata = self.driver.wait_for_request(
|
|
|
|
"/international/search/api/search/batchSearch?.*", timeout=max_wait_time
|
|
|
|
"/international/search/api/search/batchSearch?.*", timeout=max_wait_time
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
rb = dict(json.loads(self.predata.body).get("flightSegments")[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 捕获 getFlightComfort 数据
|
|
|
|
# 捕获 getFlightComfort 数据
|
|
|
|
self.comfort_data = self.capture_flight_comfort_data()
|
|
|
|
self.comfort_data = self.capture_flight_comfort_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rb = dict(json.loads(self.predata.body).get("flightSegments")[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
except Exception as e:
|
|
|
|
# 错误次数+1
|
|
|
|
# 错误次数+1
|
|
|
@ -761,7 +769,7 @@ class DataFetcher(object):
|
|
|
|
f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
|
|
|
|
f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# 重新更换城市
|
|
|
|
# 重新更换城市
|
|
|
|
print(
|
|
|
|
print(
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_data:重新更换城市:{rb["departureCityName"]}-{rb["arrivalCityName"]}-{rb["departureDate"]}'
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_data:重新更换城市:{rb["departureCityName"]}-{rb["arrivalCityName"]}-{rb["departureDate"]}'
|
|
|
|
)
|
|
|
|
)
|
|
|
@ -813,7 +821,7 @@ class DataFetcher(object):
|
|
|
|
|
|
|
|
|
|
|
|
# 检查注意事项和验证码
|
|
|
|
# 检查注意事项和验证码
|
|
|
|
if self.check_verification_code():
|
|
|
|
if self.check_verification_code():
|
|
|
|
# 重试
|
|
|
|
# 重试
|
|
|
|
self.get_data()
|
|
|
|
self.get_data()
|
|
|
|
# 判断错误次数
|
|
|
|
# 判断错误次数
|
|
|
|
if self.err >= max_retry_time:
|
|
|
|
if self.err >= max_retry_time:
|
|
|
@ -830,7 +838,7 @@ class DataFetcher(object):
|
|
|
|
# 重置错误计数
|
|
|
|
# 重置错误计数
|
|
|
|
self.err = 0
|
|
|
|
self.err = 0
|
|
|
|
|
|
|
|
|
|
|
|
# 若无错误,执行下一步
|
|
|
|
# 若无错误,执行下一步
|
|
|
|
self.check_data()
|
|
|
|
self.check_data()
|
|
|
|
|
|
|
|
|
|
|
|
def check_data(self):
|
|
|
|
def check_data(self):
|
|
|
@ -1041,9 +1049,43 @@ class DataFetcher(object):
|
|
|
|
def mergedata(self):
|
|
|
|
def mergedata(self):
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
self.df = self.flights.merge(self.prices, on=["flightNo"])
|
|
|
|
self.df = self.flights.merge(self.prices, on=["flightNo"])
|
|
|
|
|
|
|
|
print(f"合并后的航班数据形状: {self.df.shape}")
|
|
|
|
|
|
|
|
print(f"合并后的航班数据列: {self.df.columns}")
|
|
|
|
|
|
|
|
|
|
|
|
self.df["dateGetTime"] = dt.now().strftime("%Y-%m-%d")
|
|
|
|
self.df["dateGetTime"] = dt.now().strftime("%Y-%m-%d")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"获取到的舒适度数据: {self.comfort_data}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if self.comfort_data:
|
|
|
|
|
|
|
|
comfort_df = pd.DataFrame.from_dict(self.comfort_data, orient='index')
|
|
|
|
|
|
|
|
comfort_df.reset_index(inplace=True)
|
|
|
|
|
|
|
|
comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"舒适度数据形状: {comfort_df.shape}")
|
|
|
|
|
|
|
|
print(f"舒适度数据列: {comfort_df.columns}")
|
|
|
|
|
|
|
|
print(f"舒适度数据前几行: \n{comfort_df.head()}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 检查 operateFlightNo 列是否存在
|
|
|
|
|
|
|
|
if 'operateFlightNo' in self.df.columns:
|
|
|
|
|
|
|
|
print(f"合并前的 operateFlightNo 唯一值: {self.df['operateFlightNo'].unique()}")
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
print("警告: operateFlightNo 列不存在于数据中")
|
|
|
|
|
|
|
|
print(f"现有的列: {self.df.columns}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"合并前的 flight_no 唯一值: {comfort_df['flight_no'].unique()}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 创建一个临时列来存储用于匹配的航班号
|
|
|
|
|
|
|
|
self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 使用 left join 来合并数据
|
|
|
|
|
|
|
|
self.df = self.df.merge(comfort_df, left_on='match_flight_no', right_on='flight_no', how='left')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"合并后的数据形状: {self.df.shape}")
|
|
|
|
|
|
|
|
print(f"合并后的数据列: {self.df.columns}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 删除临时列和多余的flight_no列
|
|
|
|
|
|
|
|
self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True)
|
|
|
|
|
|
|
|
|
|
|
|
if rename_col:
|
|
|
|
if rename_col:
|
|
|
|
# 对pandas的columns进行重命名
|
|
|
|
# 对pandas的columns进行重命名
|
|
|
|
order = [
|
|
|
|
order = [
|
|
|
@ -1096,15 +1138,33 @@ class DataFetcher(object):
|
|
|
|
|
|
|
|
|
|
|
|
columns = dict(zip(origin, order))
|
|
|
|
columns = dict(zip(origin, order))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 添加舒适度数据的列名映射
|
|
|
|
|
|
|
|
comfort_columns = {
|
|
|
|
|
|
|
|
'departure_delay_time': '出发延误时间',
|
|
|
|
|
|
|
|
'departure_bridge_rate': '出发廊桥率',
|
|
|
|
|
|
|
|
'arrival_delay_time': '到达延误时间',
|
|
|
|
|
|
|
|
'plane_type': '飞机类型',
|
|
|
|
|
|
|
|
'plane_width': '飞机宽度',
|
|
|
|
|
|
|
|
'plane_age': '飞机机龄',
|
|
|
|
|
|
|
|
'Y_has_meal': '经济舱是否有餐食',
|
|
|
|
|
|
|
|
'Y_seat_tilt': '经济舱座椅倾斜度',
|
|
|
|
|
|
|
|
'Y_seat_width': '经济舱座椅宽度',
|
|
|
|
|
|
|
|
'Y_seat_pitch': '经济舱座椅间距',
|
|
|
|
|
|
|
|
'Y_meal_msg': '经济舱餐食信息',
|
|
|
|
|
|
|
|
'Y_power': '经济舱电源',
|
|
|
|
|
|
|
|
'C_has_meal': '商务舱是否有餐食',
|
|
|
|
|
|
|
|
'C_seat_tilt': '商务舱座椅倾斜度',
|
|
|
|
|
|
|
|
'C_seat_width': '商务舱座椅宽度',
|
|
|
|
|
|
|
|
'C_seat_pitch': '商务舱座椅间距',
|
|
|
|
|
|
|
|
'C_meal_msg': '商务舱餐食信息',
|
|
|
|
|
|
|
|
'C_power': '商务舱电源',
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
columns.update(comfort_columns)
|
|
|
|
|
|
|
|
|
|
|
|
self.df = self.df.rename(columns=columns)
|
|
|
|
self.df = self.df.rename(columns=columns)
|
|
|
|
|
|
|
|
|
|
|
|
if del_info:
|
|
|
|
if del_info:
|
|
|
|
self.df = self.df[order]
|
|
|
|
self.df = self.df[order + list(comfort_columns.values())]
|
|
|
|
|
|
|
|
|
|
|
|
# 如果有 comfort_data,将其添加到数据框中
|
|
|
|
|
|
|
|
if hasattr(self, 'comfort_data') and self.comfort_data:
|
|
|
|
|
|
|
|
comfort_df = pd.DataFrame(self.comfort_data)
|
|
|
|
|
|
|
|
self.df = pd.concat([self.df, comfort_df], axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
files_dir = os.path.join(
|
|
|
|
files_dir = os.path.join(
|
|
|
|
os.getcwd(), self.date, dt.now().strftime("%Y-%m-%d")
|
|
|
|
os.getcwd(), self.date, dt.now().strftime("%Y-%m-%d")
|
|
|
@ -1123,77 +1183,90 @@ class DataFetcher(object):
|
|
|
|
return 0
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
except Exception as e:
|
|
|
|
print(f"合并数据失败 {str(e).split('Stacktrace:')[0]}")
|
|
|
|
print(f"合并数据失败 {str(e)}")
|
|
|
|
|
|
|
|
print(f"错误类型: {type(e).__name__}")
|
|
|
|
|
|
|
|
print(f"错误详情: {str(e)}")
|
|
|
|
|
|
|
|
import traceback
|
|
|
|
|
|
|
|
print(f"错误堆栈: {traceback.format_exc()}")
|
|
|
|
return 0
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
|
|
|
def capture_flight_comfort_data(self):
|
|
|
|
def capture_flight_comfort_data(self):
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
# 等待并捕获所有 getFlightComfort 请求
|
|
|
|
# 滚动页面到底部以加载所有内容
|
|
|
|
|
|
|
|
last_height = self.driver.execute_script("return document.body.scrollHeight")
|
|
|
|
|
|
|
|
while True:
|
|
|
|
|
|
|
|
# 分步滚动页面
|
|
|
|
|
|
|
|
for i in range(10): # 将页面分成10步滚动
|
|
|
|
|
|
|
|
scroll_height = last_height * (i + 1) / 3
|
|
|
|
|
|
|
|
self.driver.execute_script(f"window.scrollTo(0, {scroll_height});")
|
|
|
|
|
|
|
|
time.sleep(0.5) # 每一小步等待0.5秒
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 等待页面加载
|
|
|
|
|
|
|
|
time.sleep(3) # 滚动到底部后多等待3秒
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 计算新的滚动高度并与最后的滚动高度进行比较
|
|
|
|
|
|
|
|
new_height = self.driver.execute_script("return document.body.scrollHeight")
|
|
|
|
|
|
|
|
if new_height == last_height:
|
|
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
last_height = new_height
|
|
|
|
|
|
|
|
|
|
|
|
comfort_requests = self.driver.requests
|
|
|
|
comfort_requests = self.driver.requests
|
|
|
|
comfort_data = []
|
|
|
|
comfort_data = {}
|
|
|
|
|
|
|
|
batch_comfort_found = False
|
|
|
|
|
|
|
|
getFlightComfort_requests_count = 0
|
|
|
|
|
|
|
|
total_requests_count = len(comfort_requests)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 开始分析请求,总请求数:{total_requests_count}")
|
|
|
|
|
|
|
|
|
|
|
|
for request in comfort_requests:
|
|
|
|
for request in comfort_requests:
|
|
|
|
if "/international/search/api/flight/comfort/getFlightComfort" in request.url:
|
|
|
|
if "/search/api/flight/comfort/batchGetComfortTagList" in request.url:
|
|
|
|
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获到 getFlightComfort 请求:")
|
|
|
|
batch_comfort_found = True
|
|
|
|
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 找到 batchGetComfortTagList 请求")
|
|
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if "/search/api/flight/comfort/getFlightComfort" in request.url:
|
|
|
|
|
|
|
|
getFlightComfort_requests_count += 1
|
|
|
|
|
|
|
|
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获到第 {getFlightComfort_requests_count} 个 getFlightComfort 请求:")
|
|
|
|
print(f"URL: {request.url}")
|
|
|
|
print(f"URL: {request.url}")
|
|
|
|
print(f"Method: {request.method}")
|
|
|
|
|
|
|
|
print(f"Headers: {request.headers}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 提取请求 payload 中的航班号
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
payload = json.loads(request.body.decode('utf-8'))
|
|
|
|
payload = json.loads(request.body.decode('utf-8'))
|
|
|
|
flight_no = payload.get('flightNoList', ['Unknown'])[0]
|
|
|
|
flight_no = payload.get('flightNoList', ['Unknown'])[0]
|
|
|
|
print(f"Flight Number: {flight_no}")
|
|
|
|
print(f"请求的航班号: {flight_no}")
|
|
|
|
except Exception as e:
|
|
|
|
except Exception as e:
|
|
|
|
print(f"无法解析请求 payload: {str(e)}")
|
|
|
|
print(f"无法解析请求 payload: {str(e)}")
|
|
|
|
flight_no = 'Unknown'
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
if request.response:
|
|
|
|
if request.response:
|
|
|
|
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应:")
|
|
|
|
print(f"响应状态码: {request.response.status_code}")
|
|
|
|
print(f"Status Code: {request.response.status_code}")
|
|
|
|
|
|
|
|
print(f"Headers: {request.response.headers}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 检查是否是gzip压缩的响应
|
|
|
|
|
|
|
|
content_encoding = request.response.headers.get('Content-Encoding', '').lower()
|
|
|
|
|
|
|
|
body = request.response.body
|
|
|
|
body = request.response.body
|
|
|
|
if content_encoding == 'gzip':
|
|
|
|
if request.response.headers.get('Content-Encoding', '').lower() == 'gzip':
|
|
|
|
body = gzip.decompress(body)
|
|
|
|
body = gzip.decompress(body)
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
body_text = body.decode('utf-8')
|
|
|
|
json_data = json.loads(body.decode('utf-8'))
|
|
|
|
print(f"\n响应体内容:\n{body_text[:1000]}...") # 打印前1000个字符
|
|
|
|
print(f"响应数据: {json.dumps(json_data, indent=2, ensure_ascii=False)[:500]}...") # 打印前500个字符
|
|
|
|
|
|
|
|
|
|
|
|
json_data = json.loads(body_text)
|
|
|
|
|
|
|
|
if json_data['status'] == 0 and json_data['msg'] == 'success':
|
|
|
|
if json_data['status'] == 0 and json_data['msg'] == 'success':
|
|
|
|
flight_comfort = json_data['data']
|
|
|
|
flight_comfort = json_data['data']
|
|
|
|
|
|
|
|
|
|
|
|
# 提取准点率信息
|
|
|
|
|
|
|
|
punctuality = flight_comfort['punctualityInfo']
|
|
|
|
punctuality = flight_comfort['punctualityInfo']
|
|
|
|
|
|
|
|
|
|
|
|
# 提取飞机信息
|
|
|
|
|
|
|
|
plane_info = flight_comfort['planeInfo']
|
|
|
|
plane_info = flight_comfort['planeInfo']
|
|
|
|
|
|
|
|
|
|
|
|
# 提取舱位信息
|
|
|
|
|
|
|
|
cabin_info = {cabin['cabin']: cabin for cabin in flight_comfort['cabinInfoList']}
|
|
|
|
cabin_info = {cabin['cabin']: cabin for cabin in flight_comfort['cabinInfoList']}
|
|
|
|
|
|
|
|
|
|
|
|
processed_data = {
|
|
|
|
processed_data = {
|
|
|
|
'flight_no': flight_no,
|
|
|
|
|
|
|
|
'departure_delay_time': punctuality['departureDelaytime'],
|
|
|
|
'departure_delay_time': punctuality['departureDelaytime'],
|
|
|
|
'departure_bridge_rate': punctuality['departureBridge'],
|
|
|
|
'departure_bridge_rate': punctuality['departureBridge'],
|
|
|
|
'arrival_delay_time': punctuality['arrivalDelaytime'],
|
|
|
|
'arrival_delay_time': punctuality['arrivalDelaytime'],
|
|
|
|
'arrival_bridge_rate': punctuality['arrivalBridge'],
|
|
|
|
|
|
|
|
'plane_type': plane_info['planeTypeName'],
|
|
|
|
'plane_type': plane_info['planeTypeName'],
|
|
|
|
'plane_width': plane_info['planeWidthCategory'],
|
|
|
|
'plane_width': plane_info['planeWidthCategory'],
|
|
|
|
'plane_age': plane_info['planeAge']
|
|
|
|
'plane_age': plane_info['planeAge']
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# 添加经济舱和商务舱信息
|
|
|
|
|
|
|
|
for cabin_type in ['Y', 'C']:
|
|
|
|
for cabin_type in ['Y', 'C']:
|
|
|
|
if cabin_type in cabin_info:
|
|
|
|
if cabin_type in cabin_info:
|
|
|
|
cabin = cabin_info[cabin_type]
|
|
|
|
cabin = cabin_info[cabin_type]
|
|
|
|
processed_data.update({
|
|
|
|
processed_data.update({
|
|
|
|
f'{cabin_type}_has_meal': cabin['hasMeal'],
|
|
|
|
f'{cabin_type}_has_meal': cabin['hasMeal'],
|
|
|
|
f'{cabin_type}_entertain_equipment': cabin['entertainEquipment'],
|
|
|
|
|
|
|
|
f'{cabin_type}_seat_tilt': cabin['seatTilt']['value'],
|
|
|
|
f'{cabin_type}_seat_tilt': cabin['seatTilt']['value'],
|
|
|
|
f'{cabin_type}_seat_width': cabin['seatWidth']['value'],
|
|
|
|
f'{cabin_type}_seat_width': cabin['seatWidth']['value'],
|
|
|
|
f'{cabin_type}_seat_pitch': cabin['seatPitch']['value'],
|
|
|
|
f'{cabin_type}_seat_pitch': cabin['seatPitch']['value'],
|
|
|
@ -1202,34 +1275,43 @@ class DataFetcher(object):
|
|
|
|
if 'power' in cabin:
|
|
|
|
if 'power' in cabin:
|
|
|
|
processed_data[f'{cabin_type}_power'] = cabin['power']
|
|
|
|
processed_data[f'{cabin_type}_power'] = cabin['power']
|
|
|
|
|
|
|
|
|
|
|
|
comfort_data.append(processed_data)
|
|
|
|
comfort_data[flight_no] = processed_data
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 成功提取航班 {flight_no} 的舒适度数据")
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 成功提取航班 {flight_no} 的舒适度数据")
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应状态异常: {json_data['status']}, {json_data['msg']}")
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应状态异常: {json_data['status']}, {json_data['msg']}")
|
|
|
|
except json.JSONDecodeError as je:
|
|
|
|
except Exception as e:
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 无法解析 getFlightComfort 响应的 JSON 数据: {str(je)}")
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 处理 getFlightComfort 响应时出错: {str(e)}")
|
|
|
|
except UnicodeDecodeError as ude:
|
|
|
|
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 无法解码响应体: {str(ude)}")
|
|
|
|
|
|
|
|
print(f"原始响应体 (前100字节): {body[:100]}")
|
|
|
|
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 请求没有响应")
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 请求没有响应")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 请求分析完成")
|
|
|
|
|
|
|
|
print(f"总请求数: {total_requests_count}")
|
|
|
|
|
|
|
|
print(f"batchGetComfortTagList 请求是否找到: {batch_comfort_found}")
|
|
|
|
|
|
|
|
print(f"getFlightComfort 请求数: {getFlightComfort_requests_count}")
|
|
|
|
|
|
|
|
print(f"成功提取的舒适度数据数: {len(comfort_data)}")
|
|
|
|
|
|
|
|
|
|
|
|
if comfort_data:
|
|
|
|
if comfort_data:
|
|
|
|
# 创建 DataFrame
|
|
|
|
# 创建舒适度DataFrame
|
|
|
|
df = pd.DataFrame(comfort_data)
|
|
|
|
comfort_df = pd.DataFrame.from_dict(comfort_data, orient='index')
|
|
|
|
|
|
|
|
comfort_df.reset_index(inplace=True)
|
|
|
|
|
|
|
|
comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
|
|
|
|
|
|
|
|
|
|
|
|
# 创建保存目录
|
|
|
|
# 保存舒适度数据为CSV文件
|
|
|
|
save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
|
|
|
|
save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
|
|
|
|
os.makedirs(save_dir, exist_ok=True)
|
|
|
|
os.makedirs(save_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
# 保存为 CSV 文件
|
|
|
|
comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
|
|
|
|
filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
|
|
|
|
comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False)
|
|
|
|
df.to_csv(filename, encoding="UTF-8", index=False)
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}")
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {filename}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return comfort_data
|
|
|
|
return comfort_data
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 未捕获到任何 getFlightComfort 数据")
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 未捕获到任何 getFlightComfort 数据")
|
|
|
|
|
|
|
|
print("可能的原因:")
|
|
|
|
|
|
|
|
print("1. 网页没有加载完全")
|
|
|
|
|
|
|
|
print("2. 网站结构可能已经改变")
|
|
|
|
|
|
|
|
print("3. 网络连接问题")
|
|
|
|
|
|
|
|
print("4. 请求被网站拦截或限制")
|
|
|
|
return None
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
except Exception as e:
|
|
|
|