diff --git a/ctrip_flights_scraper_V3.py b/ctrip_flights_scraper_V3.py
index 30c14d6..f1ec05c 100644
--- a/ctrip_flights_scraper_V3.py
+++ b/ctrip_flights_scraper_V3.py
@@ -16,13 +16,13 @@ from datetime import datetime
# 爬取的城市
-crawal_citys = ["北京", "泉州"]
+crawal_citys = ["天津", "贵阳"]
# 爬取日期范围:起始日期。格式'2023-12-01'
-begin_date = "2024-10-13"
+begin_date = "2024-10-21"
# 爬取日期范围:结束日期。格式'2023-12-31'
-end_date = "2024-10-15"
+end_date = "2024-10-22"
# 爬取T+N,即N天后
start_interval = 1
@@ -43,7 +43,7 @@ max_wait_time = 10
max_retry_time = 5
# 是否只抓取直飞信息(True: 只抓取直飞,False: 抓取所有航班)
-direct_flight = True
+direct_flight = False
# 是否删除不重要的信息
del_info = False
@@ -116,7 +116,7 @@ def init_driver():
def gen_citys(crawal_citys):
- # 生成城市组合列表
+ # 生成城市组合列表
citys = []
ytic = list(reversed(crawal_citys))
for m in crawal_citys:
@@ -179,6 +179,7 @@ class DataFetcher(object):
self.city = None
self.err = 0 # 错误重试次数
self.switch_acc = 0 #切换账户
+ self.comfort_data = None # 新添加的属性
def refresh_driver(self):
try:
@@ -217,7 +218,7 @@ class DataFetcher(object):
# 移除分享链接
self.driver.execute_script("document.querySelectorAll('.shareline').forEach(element => element.remove());")
'''
- # 使用JavaScript删除所有的<dl>标签
+ # 使用JavaScript删除所有的<dl>标签
self.driver.execute_script("""
var elements = document.getElementsByTagName('dl');
while(elements.length > 0){
@@ -233,17 +234,22 @@ class DataFetcher(object):
def check_verification_code(self):
try:
# 检查是否有验证码元素,如果有,则需要人工处理
- if (len(self.driver.find_elements(By.ID, "verification-code"))+len(self.driver.find_elements(By.CLASS_NAME, "alert-title"))):
- print(
- f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码被触发verification-code/alert-title,等待{crawal_interval*100}后重试。'
+ if (len(self.driver.find_elements(By.ID, "verification-code")) + len(self.driver.find_elements(By.CLASS_NAME, "alert-title"))):
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码被触发verification-code/alert-title,请手动完成验证。')
+
+ # 等待用户手动处理验证码
+ input("请完成验证码,然后按回车键继续...")
+
+ # 等待页面加载完成
+ WebDriverWait(self.driver, max_wait_time).until(
+ EC.presence_of_element_located((By.CLASS_NAME, "pc_home-jipiao"))
)
- self.driver.quit()
- time.sleep(crawal_interval*100)
- self.driver = init_driver()
- self.err = 0
- self.switch_acc += 1
- self.get_page(1)
- return False
+
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码处理完成,继续执行。')
+
+ # 移除注意事项
+ self.remove_btn()
+ return True
else:
# 移除注意事项
self.remove_btn()
@@ -253,6 +259,7 @@ class DataFetcher(object):
print(
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:未知错误,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
)
+ return False
def login(self):
if login_allowed:
@@ -647,7 +654,7 @@ class DataFetcher(object):
)
print(
- f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city:更换城市和日期失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city:更换市和日期失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
)
# 检查注意事项和验证码
@@ -689,11 +696,12 @@ class DataFetcher(object):
self.predata = self.driver.wait_for_request(
"/international/search/api/search/batchSearch?.*", timeout=max_wait_time
)
-
- rb = dict(json.loads(self.predata.body).get("flightSegments")[0])
-
# 捕获 getFlightComfort 数据
self.comfort_data = self.capture_flight_comfort_data()
+
+ rb = dict(json.loads(self.predata.body).get("flightSegments")[0])
+
+
except Exception as e:
# 错误次数+1
@@ -761,7 +769,7 @@ class DataFetcher(object):
f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
)
- # 重新更换城市
+ # 重新更换城市
print(
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_data:重新更换城市:{rb["departureCityName"]}-{rb["arrivalCityName"]}-{rb["departureDate"]}'
)
@@ -813,7 +821,7 @@ class DataFetcher(object):
# 检查注意事项和验证码
if self.check_verification_code():
- # 重试
+ # 重试
self.get_data()
# 判错误次数
if self.err >= max_retry_time:
@@ -830,7 +838,7 @@ class DataFetcher(object):
# 重置错误计数
self.err = 0
- # 若无错误,执行下一步
+ # 若无错误,执行下一步
self.check_data()
def check_data(self):
@@ -1041,9 +1049,43 @@ class DataFetcher(object):
def mergedata(self):
try:
self.df = self.flights.merge(self.prices, on=["flightNo"])
+ print(f"合并后的航班数据形状: {self.df.shape}")
+ print(f"合并后的航班数据列: {self.df.columns}")
self.df["dateGetTime"] = dt.now().strftime("%Y-%m-%d")
+ print(f"获取到的舒适度数据: {self.comfort_data}")
+
+ if self.comfort_data:
+ comfort_df = pd.DataFrame.from_dict(self.comfort_data, orient='index')
+ comfort_df.reset_index(inplace=True)
+ comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
+
+ print(f"舒适度数据形状: {comfort_df.shape}")
+ print(f"舒适度数据列: {comfort_df.columns}")
+ print(f"舒适度数据前几行: \n{comfort_df.head()}")
+
+ # 检查 operateFlightNo 列是否存在
+ if 'operateFlightNo' in self.df.columns:
+ print(f"合并前的 operateFlightNo 唯一值: {self.df['operateFlightNo'].unique()}")
+ else:
+ print("警告: operateFlightNo 列不存在于数据中")
+ print(f"现有的列: {self.df.columns}")
+
+ print(f"合并前的 flight_no 唯一值: {comfort_df['flight_no'].unique()}")
+
+ # 创建一个临时列来存储用于匹配的航班号
+ self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
+
+ # 使用 left join 来合并数据
+ self.df = self.df.merge(comfort_df, left_on='match_flight_no', right_on='flight_no', how='left')
+
+ print(f"合并后的数据形状: {self.df.shape}")
+ print(f"合并后的数据列: {self.df.columns}")
+
+ # 删除临时列和多余的flight_no列
+ self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True)
+
if rename_col:
# 对pandas的columns进行重命名
order = [
@@ -1096,15 +1138,33 @@ class DataFetcher(object):
columns = dict(zip(origin, order))
+ # 添加舒适度数据的列名映射
+ comfort_columns = {
+ 'departure_delay_time': '出发延误时间',
+ 'departure_bridge_rate': '出发廊桥率',
+ 'arrival_delay_time': '到达延误时间',
+ 'plane_type': '飞机类型',
+ 'plane_width': '飞机宽度',
+ 'plane_age': '飞机机龄',
+ 'Y_has_meal': '经济舱是否有餐食',
+ 'Y_seat_tilt': '经济舱座椅倾斜度',
+ 'Y_seat_width': '经济舱座椅宽度',
+ 'Y_seat_pitch': '经济舱座椅间距',
+ 'Y_meal_msg': '经济舱餐食信息',
+ 'Y_power': '经济舱电源',
+ 'C_has_meal': '商务舱是否有餐食',
+ 'C_seat_tilt': '商务舱座椅倾斜度',
+ 'C_seat_width': '商务舱座椅宽度',
+ 'C_seat_pitch': '商务舱座椅间距',
+ 'C_meal_msg': '商务舱餐食信息',
+ 'C_power': '商务舱电源',
+ }
+ columns.update(comfort_columns)
+
self.df = self.df.rename(columns=columns)
if del_info:
- self.df = self.df[order]
-
- # 如果有 comfort_data,将其添加到数据框中
- if hasattr(self, 'comfort_data') and self.comfort_data:
- comfort_df = pd.DataFrame(self.comfort_data)
- self.df = pd.concat([self.df, comfort_df], axis=1)
+ self.df = self.df[order + list(comfort_columns.values())]
files_dir = os.path.join(
os.getcwd(), self.date, dt.now().strftime("%Y-%m-%d")
@@ -1123,77 +1183,90 @@ class DataFetcher(object):
return 0
except Exception as e:
- print(f"合并数据失败 {str(e).split('Stacktrace:')[0]}")
+ print(f"合并数据失败 {str(e)}")
+ print(f"错误类型: {type(e).__name__}")
+ print(f"错误详情: {str(e)}")
+ import traceback
+ print(f"错误堆栈: {traceback.format_exc()}")
return 0
def capture_flight_comfort_data(self):
try:
- # 等待并捕获所有 getFlightComfort 请求
+ # 滚动页面到底部以加载所有内容
+ last_height = self.driver.execute_script("return document.body.scrollHeight")
+ while True:
+ # 分步滚动页面
+ for i in range(10): # 将页面分成10步滚动
+ scroll_height = last_height * (i + 1) / 3
+ self.driver.execute_script(f"window.scrollTo(0, {scroll_height});")
+ time.sleep(0.5) # 每一小步等待0.5秒
+
+ # 等待页面加载
+ time.sleep(3) # 滚动到底部后多等待3秒
+
+ # 计算新的滚动高度并与最后的滚动高度进行比较
+ new_height = self.driver.execute_script("return document.body.scrollHeight")
+ if new_height == last_height:
+ break
+ last_height = new_height
+
comfort_requests = self.driver.requests
- comfort_data = []
+ comfort_data = {}
+ batch_comfort_found = False
+ getFlightComfort_requests_count = 0
+ total_requests_count = len(comfort_requests)
+
+ print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 开始分析请求,总请求数:{total_requests_count}")
for request in comfort_requests:
- if "/international/search/api/flight/comfort/getFlightComfort" in request.url:
- print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获到 getFlightComfort 请求:")
+ if "/search/api/flight/comfort/batchGetComfortTagList" in request.url:
+ batch_comfort_found = True
+ print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 找到 batchGetComfortTagList 请求")
+ continue
+
+ if "/search/api/flight/comfort/getFlightComfort" in request.url:
+ getFlightComfort_requests_count += 1
+ print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获到第 {getFlightComfort_requests_count} 个 getFlightComfort 请求:")
print(f"URL: {request.url}")
- print(f"Method: {request.method}")
- print(f"Headers: {request.headers}")
- # 提取请求 payload 中的航班号
try:
payload = json.loads(request.body.decode('utf-8'))
flight_no = payload.get('flightNoList', ['Unknown'])[0]
- print(f"Flight Number: {flight_no}")
+ print(f"请求的航班号: {flight_no}")
except Exception as e:
print(f"无法解析请求 payload: {str(e)}")
- flight_no = 'Unknown'
+ continue
if request.response:
- print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应:")
- print(f"Status Code: {request.response.status_code}")
- print(f"Headers: {request.response.headers}")
-
- # 检查是否是gzip压缩的响应
- content_encoding = request.response.headers.get('Content-Encoding', '').lower()
+ print(f"响应状态码: {request.response.status_code}")
body = request.response.body
- if content_encoding == 'gzip':
+ if request.response.headers.get('Content-Encoding', '').lower() == 'gzip':
body = gzip.decompress(body)
try:
- body_text = body.decode('utf-8')
- print(f"\n响应体内容:\n{body_text[:1000]}...") # 打印前1000个字符
-
- json_data = json.loads(body_text)
+ json_data = json.loads(body.decode('utf-8'))
+ print(f"响应数据: {json.dumps(json_data, indent=2, ensure_ascii=False)[:500]}...") # 打印前500个字符
if json_data['status'] == 0 and json_data['msg'] == 'success':
flight_comfort = json_data['data']
- # 提取准点率信息
punctuality = flight_comfort['punctualityInfo']
-
- # 提取飞机信息
plane_info = flight_comfort['planeInfo']
-
- # 提取舱位信息
cabin_info = {cabin['cabin']: cabin for cabin in flight_comfort['cabinInfoList']}
processed_data = {
- 'flight_no': flight_no,
'departure_delay_time': punctuality['departureDelaytime'],
'departure_bridge_rate': punctuality['departureBridge'],
'arrival_delay_time': punctuality['arrivalDelaytime'],
- 'arrival_bridge_rate': punctuality['arrivalBridge'],
'plane_type': plane_info['planeTypeName'],
'plane_width': plane_info['planeWidthCategory'],
'plane_age': plane_info['planeAge']
}
- # 添加经济舱和商务舱信息
for cabin_type in ['Y', 'C']:
if cabin_type in cabin_info:
cabin = cabin_info[cabin_type]
processed_data.update({
f'{cabin_type}_has_meal': cabin['hasMeal'],
- f'{cabin_type}_entertain_equipment': cabin['entertainEquipment'],
f'{cabin_type}_seat_tilt': cabin['seatTilt']['value'],
f'{cabin_type}_seat_width': cabin['seatWidth']['value'],
f'{cabin_type}_seat_pitch': cabin['seatPitch']['value'],
@@ -1202,34 +1275,43 @@ class DataFetcher(object):
if 'power' in cabin:
processed_data[f'{cabin_type}_power'] = cabin['power']
- comfort_data.append(processed_data)
+ comfort_data[flight_no] = processed_data
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 成功提取航班 {flight_no} 的舒适度数据")
else:
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应状态异常: {json_data['status']}, {json_data['msg']}")
- except json.JSONDecodeError as je:
- print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 无法解析 getFlightComfort 响应的 JSON 数据: {str(je)}")
- except UnicodeDecodeError as ude:
- print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 无法解码响应体: {str(ude)}")
- print(f"原始响应体 (前100字节): {body[:100]}")
+ except Exception as e:
+ print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 处理 getFlightComfort 响应时出错: {str(e)}")
else:
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 请求没有响应")
+ print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 请求分析完成")
+ print(f"总请求数: {total_requests_count}")
+ print(f"batchGetComfortTagList 请求是否找到: {batch_comfort_found}")
+ print(f"getFlightComfort 请求数: {getFlightComfort_requests_count}")
+ print(f"成功提取的舒适度数据数: {len(comfort_data)}")
+
if comfort_data:
- # 创建 DataFrame
- df = pd.DataFrame(comfort_data)
+ # 创建舒适度DataFrame
+ comfort_df = pd.DataFrame.from_dict(comfort_data, orient='index')
+ comfort_df.reset_index(inplace=True)
+ comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
- # 创建保存目录
+ # 保存舒适度数据为CSV文件
save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
os.makedirs(save_dir, exist_ok=True)
- # 保存为 CSV 文件
- filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
- df.to_csv(filename, encoding="UTF-8", index=False)
- print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {filename}")
+ comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
+ comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False)
+ print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}")
return comfort_data
else:
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 未捕获到任何 getFlightComfort 数据")
+ print("可能的原因:")
+ print("1. 网页没有加载完全")
+ print("2. 网站结构可能已经改变")
+ print("3. 网络连接问题")
+ print("4. 请求被网站拦截或限制")
return None
except Exception as e: