diff --git a/ctrip_flights_scraper_V3.py b/ctrip_flights_scraper_V3.py index 30c14d6..f1ec05c 100644 --- a/ctrip_flights_scraper_V3.py +++ b/ctrip_flights_scraper_V3.py @@ -16,13 +16,13 @@ from datetime import datetime # 爬取的城市 -crawal_citys = ["北京", "泉州"] +crawal_citys = ["天津", "贵阳"] # 爬取日期范围:起始日期。格式'2023-12-01' -begin_date = "2024-10-13" +begin_date = "2024-10-21" # 爬取日期范围:结束日期。格式'2023-12-31' -end_date = "2024-10-15" +end_date = "2024-10-22" # 爬取T+N,即N天后 start_interval = 1 @@ -43,7 +43,7 @@ max_wait_time = 10 max_retry_time = 5 # 是否只抓取直飞信息(True: 只抓取直飞,False: 抓取所有航班) -direct_flight = True +direct_flight = False # 是否删除不重要的信息 del_info = False @@ -116,7 +116,7 @@ def init_driver(): def gen_citys(crawal_citys): - # 生成城市组合列表 + # 生成城市组合表 citys = [] ytic = list(reversed(crawal_citys)) for m in crawal_citys: @@ -179,6 +179,7 @@ class DataFetcher(object): self.city = None self.err = 0 # 错误重试次数 self.switch_acc = 0 #切换账户 + self.comfort_data = None # 新添加的属性 def refresh_driver(self): try: @@ -217,7 +218,7 @@ class DataFetcher(object): # 移除分享链接 self.driver.execute_script("document.querySelectorAll('.shareline').forEach(element => element.remove());") ''' - # 使用JavaScript删除所有的
标签 + # 使用JavaScript除有的
标签 self.driver.execute_script(""" var elements = document.getElementsByTagName('dl'); while(elements.length > 0){ @@ -233,17 +234,22 @@ class DataFetcher(object): def check_verification_code(self): try: # 检查是否有验证码元素,如果有,则需要人工处理 - if (len(self.driver.find_elements(By.ID, "verification-code"))+len(self.driver.find_elements(By.CLASS_NAME, "alert-title"))): - print( - f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码被触发verification-code/alert-title,等待{crawal_interval*100}后重试。' + if (len(self.driver.find_elements(By.ID, "verification-code")) + len(self.driver.find_elements(By.CLASS_NAME, "alert-title"))): + print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码被触发verification-code/alert-title,请手动完成验证。') + + # 等待用户手动处理验证码 + input("请完成验证码,然后按回车键继续...") + + # 等待页面加载完成 + WebDriverWait(self.driver, max_wait_time).until( + EC.presence_of_element_located((By.CLASS_NAME, "pc_home-jipiao")) ) - self.driver.quit() - time.sleep(crawal_interval*100) - self.driver = init_driver() - self.err = 0 - self.switch_acc += 1 - self.get_page(1) - return False + + print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码处理完成,继续执行。') + + # 移除注意事项 + self.remove_btn() + return True else: # 移除注意事项 self.remove_btn() @@ -253,6 +259,7 @@ class DataFetcher(object): print( f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:未知错误,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}' ) + return False def login(self): if login_allowed: @@ -647,7 +654,7 @@ class DataFetcher(object): ) print( - f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city:更换城市和日期失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}' + f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city:更换市和日期失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}' ) # 检查注意事项和验证码 @@ -689,11 +696,12 @@ class DataFetcher(object): self.predata = self.driver.wait_for_request( "/international/search/api/search/batchSearch?.*", timeout=max_wait_time ) - - rb = dict(json.loads(self.predata.body).get("flightSegments")[0]) - # 捕获 getFlightComfort 数据 self.comfort_data = self.capture_flight_comfort_data() + + rb = dict(json.loads(self.predata.body).get("flightSegments")[0]) + + except Exception as e: # 错误次数+1 @@ -761,7 +769,7 @@ class DataFetcher(object): f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png' ) - # 重新更换城市 + # 重新更换城 print( f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_data:重新更换城市:{rb["departureCityName"]}-{rb["arrivalCityName"]}-{rb["departureDate"]}' ) @@ -813,7 +821,7 @@ class DataFetcher(object): # 检查注意事项和验证码 if self.check_verification_code(): - # 重试 + # 试 self.get_data() # 判错误次数 if self.err >= max_retry_time: @@ -830,7 +838,7 @@ class DataFetcher(object): # 重置错误计数 self.err = 0 - # 若无错误,执行下一步 + # 若无误,执行下一步 self.check_data() def check_data(self): @@ -1041,9 +1049,43 @@ class DataFetcher(object): def mergedata(self): try: self.df = self.flights.merge(self.prices, on=["flightNo"]) + print(f"合并后的航班数据形状: {self.df.shape}") + print(f"合并后的航班数据列: {self.df.columns}") self.df["dateGetTime"] = dt.now().strftime("%Y-%m-%d") + print(f"获取到的舒适度数据: {self.comfort_data}") + + if self.comfort_data: + comfort_df = pd.DataFrame.from_dict(self.comfort_data, orient='index') + comfort_df.reset_index(inplace=True) + comfort_df.rename(columns={'index': 'flight_no'}, inplace=True) + + print(f"舒适度数据形状: {comfort_df.shape}") + print(f"舒适度数据列: {comfort_df.columns}") + print(f"舒适度数据前几行: \n{comfort_df.head()}") + + # 检查 operateFlightNo 列是否存在 + if 'operateFlightNo' in self.df.columns: + print(f"合并前的 operateFlightNo 唯一值: {self.df['operateFlightNo'].unique()}") + else: + print("警告: operateFlightNo 列不存在于数据中") + print(f"现有的列: {self.df.columns}") + + print(f"合并前的 flight_no 唯一值: {comfort_df['flight_no'].unique()}") + + # 创建一个临时列来存储用于匹配的航班号 + self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo']) + + # 使用 left join 来合并数据 + self.df = self.df.merge(comfort_df, left_on='match_flight_no', right_on='flight_no', how='left') + + print(f"合并后的数据形状: {self.df.shape}") + print(f"合并后的数据列: {self.df.columns}") + + # 删除临时列和多余的flight_no列 + self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True) + if rename_col: # 对pandas的columns进行重命名 order = [ @@ -1096,15 +1138,33 @@ class DataFetcher(object): columns = dict(zip(origin, order)) + # 添加舒适度数据的列名映射 + comfort_columns = { + 'departure_delay_time': '出发延误时间', + 'departure_bridge_rate': '出发廊桥率', + 'arrival_delay_time': '到达延误时间', + 'plane_type': '飞机类型', + 'plane_width': '飞机宽度', + 'plane_age': '飞机机龄', + 'Y_has_meal': '经济舱是否有餐食', + 'Y_seat_tilt': '经济舱座椅倾斜度', + 'Y_seat_width': '经济舱座椅宽度', + 'Y_seat_pitch': '经济舱座椅间距', + 'Y_meal_msg': '经济舱餐食信息', + 'Y_power': '经济舱电源', + 'C_has_meal': '商务舱是否有餐食', + 'C_seat_tilt': '商务舱座椅倾斜度', + 'C_seat_width': '商务舱座椅宽度', + 'C_seat_pitch': '商务舱座椅间距', + 'C_meal_msg': '商务舱餐食信息', + 'C_power': '商务舱电源', + } + columns.update(comfort_columns) + self.df = self.df.rename(columns=columns) if del_info: - self.df = self.df[order] - - # 如果有 comfort_data,将其添加到数据框中 - if hasattr(self, 'comfort_data') and self.comfort_data: - comfort_df = pd.DataFrame(self.comfort_data) - self.df = pd.concat([self.df, comfort_df], axis=1) + self.df = self.df[order + list(comfort_columns.values())] files_dir = os.path.join( os.getcwd(), self.date, dt.now().strftime("%Y-%m-%d") @@ -1123,77 +1183,90 @@ class DataFetcher(object): return 0 except Exception as e: - print(f"合并数据失败 {str(e).split('Stacktrace:')[0]}") + print(f"合并数据失败 {str(e)}") + print(f"错误类型: {type(e).__name__}") + print(f"错误详情: {str(e)}") + import traceback + print(f"错误堆栈: {traceback.format_exc()}") return 0 def capture_flight_comfort_data(self): try: - # 等待并捕获所有 getFlightComfort 请求 + # 滚动页面到底部以加载所有内容 + last_height = self.driver.execute_script("return document.body.scrollHeight") + while True: + # 分步滚动页面 + for i in range(10): # 将页面分成10步滚动 + scroll_height = last_height * (i + 1) / 3 + self.driver.execute_script(f"window.scrollTo(0, {scroll_height});") + time.sleep(0.5) # 每一小步等待0.5秒 + + # 等待页面加载 + time.sleep(3) # 滚动到底部后多等待3秒 + + # 计算新的滚动高度并与最后的滚动高度进行比较 + new_height = self.driver.execute_script("return document.body.scrollHeight") + if new_height == last_height: + break + last_height = new_height + comfort_requests = self.driver.requests - comfort_data = [] + comfort_data = {} + batch_comfort_found = False + getFlightComfort_requests_count = 0 + total_requests_count = len(comfort_requests) + + print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 开始分析请求,总请求数:{total_requests_count}") for request in comfort_requests: - if "/international/search/api/flight/comfort/getFlightComfort" in request.url: - print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获到 getFlightComfort 请求:") + if "/search/api/flight/comfort/batchGetComfortTagList" in request.url: + batch_comfort_found = True + print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 找到 batchGetComfortTagList 请求") + continue + + if "/search/api/flight/comfort/getFlightComfort" in request.url: + getFlightComfort_requests_count += 1 + print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获到第 {getFlightComfort_requests_count} 个 getFlightComfort 请求:") print(f"URL: {request.url}") - print(f"Method: {request.method}") - print(f"Headers: {request.headers}") - # 提取请求 payload 中的航班号 try: payload = json.loads(request.body.decode('utf-8')) flight_no = payload.get('flightNoList', ['Unknown'])[0] - print(f"Flight Number: {flight_no}") + print(f"请求的航班号: {flight_no}") except Exception as e: print(f"无法解析请求 payload: {str(e)}") - flight_no = 'Unknown' + continue if request.response: - print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应:") - print(f"Status Code: {request.response.status_code}") - print(f"Headers: {request.response.headers}") - - # 检查是否是gzip压缩的响应 - content_encoding = request.response.headers.get('Content-Encoding', '').lower() + print(f"响应状态码: {request.response.status_code}") body = request.response.body - if content_encoding == 'gzip': + if request.response.headers.get('Content-Encoding', '').lower() == 'gzip': body = gzip.decompress(body) try: - body_text = body.decode('utf-8') - print(f"\n响应体内容:\n{body_text[:1000]}...") # 打印前1000个字符 - - json_data = json.loads(body_text) + json_data = json.loads(body.decode('utf-8')) + print(f"响应数据: {json.dumps(json_data, indent=2, ensure_ascii=False)[:500]}...") # 打印前500个字符 if json_data['status'] == 0 and json_data['msg'] == 'success': flight_comfort = json_data['data'] - # 提取准点率信息 punctuality = flight_comfort['punctualityInfo'] - - # 提取飞机信息 plane_info = flight_comfort['planeInfo'] - - # 提取舱位信息 cabin_info = {cabin['cabin']: cabin for cabin in flight_comfort['cabinInfoList']} processed_data = { - 'flight_no': flight_no, 'departure_delay_time': punctuality['departureDelaytime'], 'departure_bridge_rate': punctuality['departureBridge'], 'arrival_delay_time': punctuality['arrivalDelaytime'], - 'arrival_bridge_rate': punctuality['arrivalBridge'], 'plane_type': plane_info['planeTypeName'], 'plane_width': plane_info['planeWidthCategory'], 'plane_age': plane_info['planeAge'] } - # 添加经济舱和商务舱信息 for cabin_type in ['Y', 'C']: if cabin_type in cabin_info: cabin = cabin_info[cabin_type] processed_data.update({ f'{cabin_type}_has_meal': cabin['hasMeal'], - f'{cabin_type}_entertain_equipment': cabin['entertainEquipment'], f'{cabin_type}_seat_tilt': cabin['seatTilt']['value'], f'{cabin_type}_seat_width': cabin['seatWidth']['value'], f'{cabin_type}_seat_pitch': cabin['seatPitch']['value'], @@ -1202,34 +1275,43 @@ class DataFetcher(object): if 'power' in cabin: processed_data[f'{cabin_type}_power'] = cabin['power'] - comfort_data.append(processed_data) + comfort_data[flight_no] = processed_data print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 成功提取航班 {flight_no} 的舒适度数据") else: print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应状态异常: {json_data['status']}, {json_data['msg']}") - except json.JSONDecodeError as je: - print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 无法解析 getFlightComfort 响应的 JSON 数据: {str(je)}") - except UnicodeDecodeError as ude: - print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 无法解码响应体: {str(ude)}") - print(f"原始响应体 (前100字节): {body[:100]}") + except Exception as e: + print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 处理 getFlightComfort 响应时出错: {str(e)}") else: print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 请求没有响应") + print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 请求分析完成") + print(f"总请求数: {total_requests_count}") + print(f"batchGetComfortTagList 请求是否找到: {batch_comfort_found}") + print(f"getFlightComfort 请求数: {getFlightComfort_requests_count}") + print(f"成功提取的舒适度数据数: {len(comfort_data)}") + if comfort_data: - # 创建 DataFrame - df = pd.DataFrame(comfort_data) + # 创建舒适度DataFrame + comfort_df = pd.DataFrame.from_dict(comfort_data, orient='index') + comfort_df.reset_index(inplace=True) + comfort_df.rename(columns={'index': 'flight_no'}, inplace=True) - # 创建保存目录 + # 保存舒适度数据为CSV文件 save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d")) os.makedirs(save_dir, exist_ok=True) - # 保存为 CSV 文件 - filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv") - df.to_csv(filename, encoding="UTF-8", index=False) - print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {filename}") + comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv") + comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False) + print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}") return comfort_data else: print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 未捕获到任何 getFlightComfort 数据") + print("可能的原因:") + print("1. 网页没有加载完全") + print("2. 网站结构可能已经改变") + print("3. 网络连接问题") + print("4. 请求被网站拦截或限制") return None except Exception as e: