merge without operateFlightNo

9 months ago · 1c8a00fbbb
parent d4e76dba28
commit 1c8a00fbbb
1 changed file with 50 additions and 28 deletions
--- a/ctrip_flights_scraper_V3.py
+++ b/ctrip_flights_scraper_V3.py
@ -16,13 +16,13 @@ from datetime import datetime


 # 爬取的城市
-crawal_citys = ["天津",  "贵阳"]
+crawal_citys = ["天津",  "泉州"]

 # 爬取日期范围：起始日期。格式'2023-12-01'
-begin_date = "2024-10-21"
+begin_date = "2024-10-22"

 # 爬取日期范围：结束日期。格式'2023-12-31'
-end_date = "2024-10-22"
+end_date = "2024-11-01"

 # 爬取T+N，即N天后
 start_interval = 1
@ -79,8 +79,7 @@ def download_stealth_js(file_path, url='https://raw.githubusercontent.com/requir
        print(f"{file_path} already exists, no need to download.")

 def init_driver():
-    # options = webdriver.ChromeOptions() # 创建一个配置对象
-    options = webdriver.EdgeOptions()  # 创建一个配置对象
+    options = webdriver.ChromeOptions()  # 改为ChromeOptions
    options.add_argument("--incognito")  # 隐身模式（无痕模式）
    # options.add_argument('--headless')  # 启用无头模式
    options.add_argument("--no-sandbox")
@ -96,10 +95,11 @@ def init_driver():
    options.add_argument("--ignore-certificate-errors-spki-list")
    options.add_argument("--ignore-ssl-errors")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])  # 不显示正在受自动化软件控制的提示
-    # chromeDriverPath = 'C:/Program Files/Google/Chrome/Application/chromedriver' #chromedriver位置
-    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69")
-    # driver = webdriver.Chrome(executable_path=self.chromeDriverPath,chrome_options=self.options)
-    driver = webdriver.Edge(options=options)
+    
+    # 如果需要指定Chrome驱动的路径，取消下面这行的注释并设置正确的路径
+    # chromedriver_path = '/path/to/chromedriver'
+    
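+    driver = webdriver.Chrome(options=options)  # Switched to Chrome. To pin a driver binary, pass service=Service(chromedriver_path); the executable_path keyword was removed in Selenium 4.10.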
    
    try:
        download_stealth_js(stealth_js_path)
@ -316,7 +316,7 @@ class DataFetcher(object):
                    
                if self.err < max_retry_time:
                    # 刷新页面
-                    print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login：刷新页')
+                    print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login：刷新页面')
                    self.refresh_driver()
                    # 检查注意事项和验证码
                    if self.check_verification_code():
@ -332,16 +332,27 @@ class DataFetcher(object):
        next_stage_flag = False
        try:
            if reset_to_homepage == 1:
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试前往首页...')
+                start_time = time.time()
                # 前往首页
                self.driver.get(
                    "https://flights.ctrip.com/online/channel/domestic")
+                end_time = time.time()
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 前往首页耗时: {end_time - start_time:.2f} 秒')
+
+            print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面 URL: {self.driver.current_url}')
+            print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面标题: {self.driver.title}')

            # 检查注意事项和验证码
            if self.check_verification_code():
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 等待页面加载完成...')
                WebDriverWait(self.driver, max_wait_time).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, "pc_home-jipiao"))
                )
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载完成')
+
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试点击飞机图标...')
                # 点击飞机图标，返回主界面
                ele = WebDriverWait(self.driver, max_wait_time).until(
                    element_to_be_clickable(
@ -350,7 +361,9 @@ class DataFetcher(object):
                    )
                )
                ele.click()
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功点击飞机图标')

+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试选择单程...')
                # 单程
                ele = WebDriverWait(self.driver, max_wait_time).until(
                    element_to_be_clickable(
@ -359,7 +372,9 @@ class DataFetcher(object):
                    )
                )
                ele.click()
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功选择单程')

+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试点击搜索按钮...')
                # 搜索
                ele = WebDriverWait(self.driver, max_wait_time).until(
                    element_to_be_clickable(
@ -367,6 +382,7 @@ class DataFetcher(object):
                    )
                )
                ele.click()
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功点击搜索按钮')

                next_stage_flag = True
        except Exception as e:
@ -374,20 +390,26 @@ class DataFetcher(object):
            print(
                f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_page：页面加载或元素操作失败，错误类型：{type(e).__name__}, 详细错误信息：{str(e).split("Stacktrace:")[0]}'
            )
+            print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面 URL: {self.driver.current_url}')
+            print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面标题: {self.driver.title}')
+            print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面源代码: {self.driver.page_source[:500]}...')  # 只打印前500个字符

            # 保存错误截图
            if enable_screenshot:
-                self.driver.save_screenshot(
-                    f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
-                )
+                screenshot_path = f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
+                self.driver.save_screenshot(screenshot_path)
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误截图已保存: {screenshot_path}')

-            # 重新尝试加载页面，这指定需要重向到首页
+            # 重新尝试加载页面，这次指定需要重定向到首页
+            print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 重新尝试加载页面，这次指定需要重定向到首页')
            self.get_page(1)
        else:
            if next_stage_flag:
                # 继续下一步
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载成功，继续下一步')
                self.change_city()
-
+            else:
+                print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载成功，但未能完成所有操作')
    def change_city(self):
        next_stage_flag = False
        try:
@ -733,14 +755,13 @@ class DataFetcher(object):
            # 判断错误次数
            if self.err >= max_retry_time:
                print(
-                    f'{time.strftime("%Y-%m-%d_%H-%M-%S")} <EFBFBD><EFBFBD><EFBFBD>误次数【{self.err}-{max_retry_time}】,get_data:重新尝试加载页面，这次指定需要重定向到首页'
+                    f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,get_data:重新尝试加载页面，这次指定需要重定向到首页'
                )

                # 重置错误计数
                self.err = 0
                # 重新尝试加载页面，这次指定需要重定向到首页
                self.get_page(1)
-
        else:
            # 删除本次请求
            del self.driver.requests
@ -1074,15 +1095,15 @@ class DataFetcher(object):
                # 检查 operateFlightNo 列是否存在
                if 'operateFlightNo' in self.df.columns:
                    print(f"合并前的 operateFlightNo 唯一值: {self.df['operateFlightNo'].unique()}")
+                    # 创建一个临时列来存储用于匹配的航班号
+                    self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
                else:
-                    print("警告: operateFlightNo 列不存在于数据中")
-                    print(f"现有的列: {self.df.columns}")
+                    print("警告: operateFlightNo 列不存在于数据中,将使用 flightNo 进行匹配")
+                    self.df['match_flight_no'] = self.df['flightNo']
                
+                print(f"现有的列: {self.df.columns}")
                print(f"合并前的 flight_no 唯一值: {comfort_df['flight_no'].unique()}")
                
-                # 创建一个临时列来存储用于匹配的航班号
-                self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
-                
                # 使用 left join 来合并数据
                self.df = self.df.merge(comfort_df, left_on='match_flight_no', right_on='flight_no', how='left')
                
@ -1090,7 +1111,8 @@ class DataFetcher(object):
                print(f"合并后的数据列: {self.df.columns}")
                
                # 删除临时列和多余的flight_no列
-                self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True)
+                self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True, errors='ignore')
+

            if rename_col:
                # 对pandas的columns进行重命名
@ -1305,12 +1327,12 @@ class DataFetcher(object):
                comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
                
                # 保存舒适度数据为CSV文件
-                save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
-                os.makedirs(save_dir, exist_ok=True)
+                # save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
+                # os.makedirs(save_dir, exist_ok=True)
                
-                comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
-                comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False)
-                print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}")
+                # comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
+                # comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False)
+                # print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}")
                
                return comfort_data
            else: