merge all comfort info

dev-clawer
Lin 1 month ago
parent 37a072fadc
commit 3f9776c4b1

@ -16,13 +16,13 @@ from datetime import datetime
# Cities to crawl.
# NOTE(review): several settings below are assigned twice in a row; as written,
# the later assignment is the one that takes effect. This looks like the removed
# and added lines of a diff hunk shown together — confirm against the real file.
crawal_citys = ["北京", "泉州"]
crawal_citys = ["天津", "贵阳"]
# Crawl date range: start date, formatted like '2023-12-01'.
begin_date = "2024-10-13"
begin_date = "2024-10-21"
# Crawl date range: end date, formatted like '2023-12-31'.
end_date = "2024-10-15"
end_date = "2024-10-22"
# Crawl T+N, i.e. N days from now.
start_interval = 1
@ -43,7 +43,7 @@ max_wait_time = 10
# Retry limit; the fetcher compares its error counter against this value
# (self.err >= max_retry_time).
max_retry_time = 5
# Whether to crawl direct flights only (True: direct flights only; False: all flights).
# NOTE(review): assigned twice in a row; the second value (False) is the effective one.
direct_flight = True
direct_flight = False
# Whether to drop unimportant information.
del_info = False
@ -116,7 +116,7 @@ def init_driver():
def gen_citys(crawal_citys):
# 生成城市组合
# 生成城市组合
citys = []
ytic = list(reversed(crawal_citys))
for m in crawal_citys:
@ -179,6 +179,7 @@ class DataFetcher(object):
self.city = None
self.err = 0 # 错误重试次数
self.switch_acc = 0 #切换账户
self.comfort_data = None # 新添加的属性
def refresh_driver(self):
try:
@ -217,7 +218,7 @@ class DataFetcher(object):
# 移除分享链接
self.driver.execute_script("document.querySelectorAll('.shareline').forEach(element => element.remove());")
'''
# 使用JavaScript有的<dl>标签
# 使用JavaScript除有的<dl>标签
self.driver.execute_script("""
var elements = document.getElementsByTagName('dl');
while(elements.length > 0){
@ -233,17 +234,22 @@ class DataFetcher(object):
def check_verification_code(self):
try:
# 检查是否有验证码元素,如果有,则需要人工处理
if (len(self.driver.find_elements(By.ID, "verification-code"))+len(self.driver.find_elements(By.CLASS_NAME, "alert-title"))):
print(
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code验证码被触发verification-code/alert-title等待{crawal_interval*100}后重试。'
if (len(self.driver.find_elements(By.ID, "verification-code")) + len(self.driver.find_elements(By.CLASS_NAME, "alert-title"))):
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code验证码被触发verification-code/alert-title请手动完成验证。')
# 等待用户手动处理验证码
input("请完成验证码,然后按回车键继续...")
# 等待页面加载完成
WebDriverWait(self.driver, max_wait_time).until(
EC.presence_of_element_located((By.CLASS_NAME, "pc_home-jipiao"))
)
self.driver.quit()
time.sleep(crawal_interval*100)
self.driver = init_driver()
self.err = 0
self.switch_acc += 1
self.get_page(1)
return False
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code验证码处理完成继续执行。')
# 移除注意事项
self.remove_btn()
return True
else:
# 移除注意事项
self.remove_btn()
@ -253,6 +259,7 @@ class DataFetcher(object):
print(
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:未知错误,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
)
return False
def login(self):
if login_allowed:
@ -647,7 +654,7 @@ class DataFetcher(object):
)
print(
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city更换市和日期失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city更换市和日期失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
)
# 检查注意事项和验证码
@ -689,11 +696,12 @@ class DataFetcher(object):
self.predata = self.driver.wait_for_request(
"/international/search/api/search/batchSearch?.*", timeout=max_wait_time
)
rb = dict(json.loads(self.predata.body).get("flightSegments")[0])
# 捕获 getFlightComfort 数据
self.comfort_data = self.capture_flight_comfort_data()
rb = dict(json.loads(self.predata.body).get("flightSegments")[0])
except Exception as e:
# 错误次数+1
@ -761,7 +769,7 @@ class DataFetcher(object):
f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
)
# 重新更换城
# 重新更换城
print(
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_data重新更换城市:{rb["departureCityName"]}-{rb["arrivalCityName"]}-{rb["departureDate"]}'
)
@ -813,7 +821,7 @@ class DataFetcher(object):
# 检查注意事项和验证码
if self.check_verification_code():
#
#
self.get_data()
# 判错误次数
if self.err >= max_retry_time:
@ -830,7 +838,7 @@ class DataFetcher(object):
# 重置错误计数
self.err = 0
# 若无误,执行下一步
# 若无误,执行下一步
self.check_data()
def check_data(self):
@ -1041,9 +1049,43 @@ class DataFetcher(object):
def mergedata(self):
try:
self.df = self.flights.merge(self.prices, on=["flightNo"])
print(f"合并后的航班数据形状: {self.df.shape}")
print(f"合并后的航班数据列: {self.df.columns}")
self.df["dateGetTime"] = dt.now().strftime("%Y-%m-%d")
print(f"获取到的舒适度数据: {self.comfort_data}")
if self.comfort_data:
comfort_df = pd.DataFrame.from_dict(self.comfort_data, orient='index')
comfort_df.reset_index(inplace=True)
comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
print(f"舒适度数据形状: {comfort_df.shape}")
print(f"舒适度数据列: {comfort_df.columns}")
print(f"舒适度数据前几行: \n{comfort_df.head()}")
# 检查 operateFlightNo 列是否存在
if 'operateFlightNo' in self.df.columns:
print(f"合并前的 operateFlightNo 唯一值: {self.df['operateFlightNo'].unique()}")
else:
print("警告: operateFlightNo 列不存在于数据中")
print(f"现有的列: {self.df.columns}")
print(f"合并前的 flight_no 唯一值: {comfort_df['flight_no'].unique()}")
# 创建一个临时列来存储用于匹配的航班号
self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
# 使用 left join 来合并数据
self.df = self.df.merge(comfort_df, left_on='match_flight_no', right_on='flight_no', how='left')
print(f"合并后的数据形状: {self.df.shape}")
print(f"合并后的数据列: {self.df.columns}")
# 删除临时列和多余的flight_no列
self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True)
if rename_col:
# 对pandas的columns进行重命名
order = [
@ -1096,15 +1138,33 @@ class DataFetcher(object):
columns = dict(zip(origin, order))
# 添加舒适度数据的列名映射
comfort_columns = {
'departure_delay_time': '出发延误时间',
'departure_bridge_rate': '出发廊桥率',
'arrival_delay_time': '到达延误时间',
'plane_type': '飞机类型',
'plane_width': '飞机宽度',
'plane_age': '飞机机龄',
'Y_has_meal': '经济舱是否有餐食',
'Y_seat_tilt': '经济舱座椅倾斜度',
'Y_seat_width': '经济舱座椅宽度',
'Y_seat_pitch': '经济舱座椅间距',
'Y_meal_msg': '经济舱餐食信息',
'Y_power': '经济舱电源',
'C_has_meal': '商务舱是否有餐食',
'C_seat_tilt': '商务舱座椅倾斜度',
'C_seat_width': '商务舱座椅宽度',
'C_seat_pitch': '商务舱座椅间距',
'C_meal_msg': '商务舱餐食信息',
'C_power': '商务舱电源',
}
columns.update(comfort_columns)
self.df = self.df.rename(columns=columns)
if del_info:
self.df = self.df[order]
# 如果有 comfort_data将其添加到数据框中
if hasattr(self, 'comfort_data') and self.comfort_data:
comfort_df = pd.DataFrame(self.comfort_data)
self.df = pd.concat([self.df, comfort_df], axis=1)
self.df = self.df[order + list(comfort_columns.values())]
files_dir = os.path.join(
os.getcwd(), self.date, dt.now().strftime("%Y-%m-%d")
@ -1123,77 +1183,90 @@ class DataFetcher(object):
return 0
except Exception as e:
print(f"合并数据失败 {str(e).split('Stacktrace:')[0]}")
print(f"合并数据失败 {str(e)}")
print(f"错误类型: {type(e).__name__}")
print(f"错误详情: {str(e)}")
import traceback
print(f"错误堆栈: {traceback.format_exc()}")
return 0
def capture_flight_comfort_data(self):
try:
# 等待并捕获所有 getFlightComfort 请求
# 滚动页面到底部以加载所有内容
last_height = self.driver.execute_script("return document.body.scrollHeight")
while True:
# 分步滚动页面
for i in range(10): # 将页面分成10步滚动
scroll_height = last_height * (i + 1) / 3
self.driver.execute_script(f"window.scrollTo(0, {scroll_height});")
time.sleep(0.5) # 每一小步等待0.5秒
# 等待页面加载
time.sleep(3) # 滚动到底部后多等待3秒
# 计算新的滚动高度并与最后的滚动高度进行比较
new_height = self.driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
comfort_requests = self.driver.requests
comfort_data = []
comfort_data = {}
batch_comfort_found = False
getFlightComfort_requests_count = 0
total_requests_count = len(comfort_requests)
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 开始分析请求,总请求数:{total_requests_count}")
for request in comfort_requests:
if "/international/search/api/flight/comfort/getFlightComfort" in request.url:
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获到 getFlightComfort 请求:")
if "/search/api/flight/comfort/batchGetComfortTagList" in request.url:
batch_comfort_found = True
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 找到 batchGetComfortTagList 请求")
continue
if "/search/api/flight/comfort/getFlightComfort" in request.url:
getFlightComfort_requests_count += 1
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获到第 {getFlightComfort_requests_count} 个 getFlightComfort 请求:")
print(f"URL: {request.url}")
print(f"Method: {request.method}")
print(f"Headers: {request.headers}")
# 提取请求 payload 中的航班号
try:
payload = json.loads(request.body.decode('utf-8'))
flight_no = payload.get('flightNoList', ['Unknown'])[0]
print(f"Flight Number: {flight_no}")
print(f"请求的航班号: {flight_no}")
except Exception as e:
print(f"无法解析请求 payload: {str(e)}")
flight_no = 'Unknown'
continue
if request.response:
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应:")
print(f"Status Code: {request.response.status_code}")
print(f"Headers: {request.response.headers}")
# 检查是否是gzip压缩的响应
content_encoding = request.response.headers.get('Content-Encoding', '').lower()
print(f"响应状态码: {request.response.status_code}")
body = request.response.body
if content_encoding == 'gzip':
if request.response.headers.get('Content-Encoding', '').lower() == 'gzip':
body = gzip.decompress(body)
try:
body_text = body.decode('utf-8')
print(f"\n响应体内容:\n{body_text[:1000]}...") # 打印前1000个字符
json_data = json.loads(body_text)
json_data = json.loads(body.decode('utf-8'))
print(f"响应数据: {json.dumps(json_data, indent=2, ensure_ascii=False)[:500]}...") # 打印前500个字符
if json_data['status'] == 0 and json_data['msg'] == 'success':
flight_comfort = json_data['data']
# 提取准点率信息
punctuality = flight_comfort['punctualityInfo']
# 提取飞机信息
plane_info = flight_comfort['planeInfo']
# 提取舱位信息
cabin_info = {cabin['cabin']: cabin for cabin in flight_comfort['cabinInfoList']}
processed_data = {
'flight_no': flight_no,
'departure_delay_time': punctuality['departureDelaytime'],
'departure_bridge_rate': punctuality['departureBridge'],
'arrival_delay_time': punctuality['arrivalDelaytime'],
'arrival_bridge_rate': punctuality['arrivalBridge'],
'plane_type': plane_info['planeTypeName'],
'plane_width': plane_info['planeWidthCategory'],
'plane_age': plane_info['planeAge']
}
# 添加经济舱和商务舱信息
for cabin_type in ['Y', 'C']:
if cabin_type in cabin_info:
cabin = cabin_info[cabin_type]
processed_data.update({
f'{cabin_type}_has_meal': cabin['hasMeal'],
f'{cabin_type}_entertain_equipment': cabin['entertainEquipment'],
f'{cabin_type}_seat_tilt': cabin['seatTilt']['value'],
f'{cabin_type}_seat_width': cabin['seatWidth']['value'],
f'{cabin_type}_seat_pitch': cabin['seatPitch']['value'],
@ -1202,34 +1275,43 @@ class DataFetcher(object):
if 'power' in cabin:
processed_data[f'{cabin_type}_power'] = cabin['power']
comfort_data.append(processed_data)
comfort_data[flight_no] = processed_data
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 成功提取航班 {flight_no} 的舒适度数据")
else:
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应状态异常: {json_data['status']}, {json_data['msg']}")
except json.JSONDecodeError as je:
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 无法解析 getFlightComfort 响应的 JSON 数据: {str(je)}")
except UnicodeDecodeError as ude:
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 无法解码响应体: {str(ude)}")
print(f"原始响应体 (前100字节): {body[:100]}")
except Exception as e:
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 处理 getFlightComfort 响应时出错: {str(e)}")
else:
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 请求没有响应")
print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 请求分析完成")
print(f"总请求数: {total_requests_count}")
print(f"batchGetComfortTagList 请求是否找到: {batch_comfort_found}")
print(f"getFlightComfort 请求数: {getFlightComfort_requests_count}")
print(f"成功提取的舒适度数据数: {len(comfort_data)}")
if comfort_data:
# 创建 DataFrame
df = pd.DataFrame(comfort_data)
# 创建舒适度DataFrame
comfort_df = pd.DataFrame.from_dict(comfort_data, orient='index')
comfort_df.reset_index(inplace=True)
comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
# 创建保存目录
# 保存舒适度数据为CSV文件
save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
os.makedirs(save_dir, exist_ok=True)
# 保存为 CSV 文件
filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
df.to_csv(filename, encoding="UTF-8", index=False)
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {filename}")
comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False)
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}")
return comfort_data
else:
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 未捕获到任何 getFlightComfort 数据")
print("可能的原因:")
print("1. 网页没有加载完全")
print("2. 网站结构可能已经改变")
print("3. 网络连接问题")
print("4. 请求被网站拦截或限制")
return None
except Exception as e:

Loading…
Cancel
Save