From 948df89ba51a4abbc680277cc1b04906f584186f Mon Sep 17 00:00:00 2001
From: sgqt <1036203584@qq.com>
Date: Wed, 6 Nov 2024 09:36:48 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
src/clawer/csv_to_xlsx_converter.py | 73 ++
src/clawer/ctrip_flights_scraper_V3.py | 1420 ++++++++++++++++++++++++
src/clawer/db_import.py | 90 ++
3 files changed, 1583 insertions(+)
create mode 100644 src/clawer/csv_to_xlsx_converter.py
create mode 100644 src/clawer/ctrip_flights_scraper_V3.py
create mode 100644 src/clawer/db_import.py
diff --git a/src/clawer/csv_to_xlsx_converter.py b/src/clawer/csv_to_xlsx_converter.py
new file mode 100644
index 0000000..c1c3581
--- /dev/null
+++ b/src/clawer/csv_to_xlsx_converter.py
@@ -0,0 +1,73 @@
+import pandas as pd
+import os
+from datetime import datetime, timedelta
+
+def get_departure_destination(file_name):
+    """Return *file_name* with its final extension removed.
+
+    The script below uses the result as the route key when grouping CSV files.
+    """
+    stem, _extension = os.path.splitext(file_name)
+    return stem
+
+def merge_csv_files(csv_files, output_xlsx):
+    """Merge several scraped CSV files into one Excel workbook.
+
+    Each CSV gets a '出发日期' column holding the name of the CSV's grandparent
+    directory, is cut down to a fixed set of columns, and has
+    'economy_origin' renamed to '票价'. The per-file frames are concatenated
+    in input order and written to *output_xlsx* with the openpyxl engine.
+
+    Args:
+        csv_files: Iterable of CSV paths.
+        output_xlsx: Destination path of the workbook.
+
+    Raises:
+        ValueError: If *csv_files* is empty, so there is nothing to merge.
+        KeyError: If a CSV lacks one of the selected columns.
+    """
+    # Loop-invariant, so it is built once instead of once per file.
+    selected_columns = [
+        '航班号', '航空公司', '出发日期', '出发时间', '到达时间',
+        '中转信息', 'economy_origin', '经济舱餐食信息', '经济舱座椅间距', '出发延误时间'
+    ]
+
+    csv_files = list(csv_files)
+    if not csv_files:
+        # pd.concat would also raise ValueError here, but with a message that
+        # does not point at the empty input.
+        raise ValueError("merge_csv_files: no CSV files to merge")
+
+    frames = []
+    for csv_file in csv_files:
+        df = pd.read_csv(csv_file)
+        # Departure date column: the directory two levels above the CSV.
+        df['出发日期'] = os.path.basename(os.path.dirname(os.path.dirname(csv_file)))
+        # Keep only the selected columns and rename 'economy_origin' to '票价'.
+        frames.append(df[selected_columns].rename(columns={'economy_origin': '票价'}))
+
+    # Merge all frames and save them as an Excel file.
+    merged_df = pd.concat(frames, ignore_index=True)
+    merged_df.to_excel(output_xlsx, index=False, engine='openpyxl')
+
+# Date range (both ends included) of the per-day folders to scan
+start_date = datetime(2024, 10, 22)
+end_date = datetime(2024, 11, 1)
+
+# Input root folder and output folder
+input_base_path = "D:\\college\\SE2\\Ctrip-Crawler-main\\Ctrip-Crawler-main"
+output_folder = "D:\\college\\SE2\\Ctrip-Crawler-main\\Ctrip-Crawler-main\\xlsx_output"
+
+# Make sure the output folder exists
+if not os.path.exists(output_folder):
+    os.makedirs(output_folder)
+
+# CSV paths grouped by route (file name without extension)
+route_files = {}
+
+current_date = start_date
+while current_date <= end_date:
+    folder_name = current_date.strftime("%Y-%m-%d")
+    # Only the "2024-10-22" sub-folder of each day folder is scanned.
+    folder_path = os.path.join(input_base_path, folder_name, "2024-10-22")
+    current_date += timedelta(days=1)
+
+    if not os.path.exists(folder_path):
+        continue
+
+    for file_name in os.listdir(folder_path):
+        if not file_name.endswith('.csv'):
+            continue
+        csv_path = os.path.join(folder_path, file_name)
+        route = get_departure_destination(file_name)
+        route_files.setdefault(route, []).append(csv_path)
+
+# Merge and save one workbook per route
+for route, files in route_files.items():
+    output_xlsx = os.path.join(output_folder, f"{route}.xlsx")
+    merge_csv_files(files, output_xlsx)
+    print(f"已合并并保存路线: {route} -> {output_xlsx}")
+
+print("所有CSV文件已成功合并为XLSX文件,并筛选了指定的列")
diff --git a/src/clawer/ctrip_flights_scraper_V3.py b/src/clawer/ctrip_flights_scraper_V3.py
new file mode 100644
index 0000000..ff0e93f
--- /dev/null
+++ b/src/clawer/ctrip_flights_scraper_V3.py
@@ -0,0 +1,1420 @@
+import magic
+import io
+import os
+import gzip
+import time
+import json
+import requests
+import pandas as pd
+from seleniumwire import webdriver
+from datetime import datetime as dt, timedelta
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+from datetime import datetime
+
+# Cities to crawl
+crawal_citys = ["杭州", "天津"]
+
+# Crawl date range: start date. Format '2023-12-01'
+begin_date = "2024-11-6"
+
+# Crawl date range: end date. Format '2023-12-31'
+end_date = "2024-11-6"
+
+# Crawl from T+N, i.e. N days from now
+start_interval = 1
+
+# Days to crawl (presumably how many dates get generated; confirm against the caller)
+crawal_days = 60
+
+# Pause between cities while crawling (seconds)
+crawal_interval = 5
+
+# Step between dates
+days_interval = 1
+
+# Maximum time to wait for a page to load (seconds)
+max_wait_time = 10
+
+# Maximum number of error retries
+max_retry_time = 5
+
+# Whether to fetch direct flights only (True: direct flights only, False: all flights)
+direct_flight = False
+
+# Whether to drop unimportant fields
+del_info = False
+
+# Whether to rename the DataFrame columns
+rename_col = True
+
+# Debug screenshots
+enable_screenshot = False
+
+# Allow logging in (a login may be required to get data)
+login_allowed = True
+
+# Accounts
+accounts = ['', '']
+
+# Passwords
+passwords = ['', '']
+
+# Use stealth.min.js to hide Selenium fingerprints
+stealth_js_path = './stealth.min.js'
+
+
+def download_stealth_js(file_path,
+                        url='https://raw.githubusercontent.com/requireCool/stealth.min.js/main/stealth.min.js'):
+    """Download stealth.min.js to *file_path* unless that file already exists.
+
+    Args:
+        file_path: Where the script is stored locally.
+        url: Where the script is fetched from.
+
+    Raises:
+        requests.RequestException: On a connection failure, a timeout, or a
+            non-success HTTP status (via ``raise_for_status``).
+        OSError: If the file cannot be written.
+    """
+    if os.path.exists(file_path):
+        print(f"{file_path} already exists, no need to download.")
+        return
+
+    print(f"{file_path} not found, downloading...")
+    # Without a timeout, requests waits indefinitely on a stalled connection.
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()  # make sure the request succeeded
+
+    # Write to a sibling temp file and rename it into place, so a failed
+    # write never leaves a truncated file that the existence check above
+    # would later accept as a finished download.
+    tmp_path = f"{file_path}.part"
+    try:
+        with open(tmp_path, 'w') as file:
+            file.write(response.text)
+        os.replace(tmp_path, file_path)
+    finally:
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+    print(f"{file_path} downloaded.")
+
+
+def init_driver():
+    """Create a Chrome driver, try to inject stealth.min.js, and maximize the window.
+
+    A failure while downloading, reading or injecting the stealth script is
+    printed and otherwise ignored; the driver is returned either way.
+    """
+    # Passed to Chrome in exactly this order. Headless mode ("--headless") is
+    # deliberately not in the list.
+    chrome_flags = (
+        "--incognito",  # incognito (private) mode
+        "--no-sandbox",
+        "--disable-dev-shm-usage",
+        "--disable-blink-features",
+        "--disable-blink-features=AutomationControlled",
+        "--disable-extensions",
+        # NOTE(review): looks like a WebDriver capability rather than a
+        # Chrome switch; confirm it has any effect.
+        "--pageLoadStrategy=eager",
+        "--disable-gpu",
+        "--disable-software-rasterizer",
+        # NOTE(review): listed a second time; kept to preserve the exact
+        # argument sequence.
+        "--disable-dev-shm-usage",
+        "--ignore-certificate-errors",
+        "--ignore-certificate-errors-spki-list",
+        "--ignore-ssl-errors",
+    )
+
+    options = webdriver.ChromeOptions()
+    for flag in chrome_flags:
+        options.add_argument(flag)
+    # Hide the "controlled by automated test software" banner
+    options.add_experimental_option("excludeSwitches", ["enable-automation"])
+
+    # To pin a specific chromedriver binary, pass its path when creating the driver.
+    driver = webdriver.Chrome(options=options)
+
+    try:
+        download_stealth_js(stealth_js_path)
+        # Read stealth.min.js and register it to run on every new document
+        with open(stealth_js_path, 'r') as file:
+            stealth_js = file.read()
+        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": stealth_js})
+    except Exception as e:
+        print(e)
+
+    driver.maximize_window()
+
+    return driver
+
+
+def gen_citys(crawal_citys):
+    """Build every ordered [origin, destination] pair of distinct cities.
+
+    Origins follow the input order; for each origin the destinations are
+    taken in reverse input order.
+    """
+    return [
+        [origin, destination]
+        for origin in crawal_citys
+        for destination in reversed(crawal_citys)
+        if origin != destination
+    ]
+
+
+def generate_flight_dates(n, begin_date, end_date, start_interval, days_interval):
+    """Build the list of flight dates to crawl, as 'YYYY-MM-DD' strings.
+
+    Starting at *begin_date* (or, when that is empty, today plus
+    *start_interval* days), one date is produced for every offset in
+    ``range(0, n, days_interval)``. When *end_date* is given, dates after it
+    are dropped and the list is then extended in *days_interval* steps for
+    as long as the next date does not pass *end_date*.
+
+    Args:
+        n: Upper bound of the day offsets generated before *end_date* is applied.
+        begin_date: First date as '%Y-%m-%d', or a falsy value to start
+            relative to today.
+        end_date: Last allowed date as '%Y-%m-%d', or a falsy value for no limit.
+        start_interval: Days from today to the first date when *begin_date*
+            is empty; a falsy value means today.
+        days_interval: Step between consecutive dates, in days.
+
+    Returns:
+        The dates in ascending order. The list is empty when no generated
+        date falls on or before *end_date*.
+
+    Raises:
+        ValueError: If a date string does not match '%Y-%m-%d' or
+            *days_interval* is 0.
+    """
+    fmt = "%Y-%m-%d"
+
+    if begin_date:
+        first_day = dt.strptime(begin_date, fmt).date()
+    else:
+        # A missing or zero start_interval used to leave begin_date unparsed
+        # and crash on the date arithmetic; treat it as "start today".
+        first_day = (dt.now() + timedelta(days=start_interval or 0)).date()
+
+    days = [first_day + timedelta(days=offset) for offset in range(0, n, days_interval)]
+
+    if end_date:
+        last_day = dt.strptime(end_date, fmt).date()
+        # Never go past the end date ...
+        days = [day for day in days if day <= last_day]
+        # ... and keep stepping until the next date would pass it. The
+        # `days and` guard covers the case where nothing survived the
+        # filter, which previously raised IndexError on days[-1].
+        step = timedelta(days=days_interval)
+        while days and days[-1] + step <= last_day:
+            days.append(days[-1] + step)
+
+    return [day.strftime(fmt) for day in days]
+
+
+# Stand-in for expected_conditions.element_to_be_clickable /
+# expected_conditions.visibility_of_element_located that works on an element
+# object instead of a locator.
+
+
+def element_to_be_clickable(element):
+    """Return a wait condition that yields *element* once it is enabled and displayed.
+
+    The returned callable takes the driver (unused) and returns *element*
+    when both ``is_enabled()`` and ``is_displayed()`` are true, otherwise
+    ``False``. An exception raised by either check also counts as "not yet".
+    """
+
+    def check_clickable(driver):
+        try:
+            if element.is_enabled() and element.is_displayed():
+                return element  # condition met: hand back the element itself
+            return False
+        except Exception:
+            # Was a bare `except:`, which also swallowed KeyboardInterrupt and
+            # SystemExit, so Ctrl-C during a wait was ignored.
+            return False
+
+    return check_clickable
+
+
+class DataFetcher(object):
+ def __init__(self, driver):
+     """Bind the fetcher to *driver* and reset its state."""
+     self.driver = driver
+
+     # Retry bookkeeping
+     self.err = 0  # error retry counter
+     self.switch_acc = 0  # account switching counter
+
+     # Nothing selected or captured yet
+     self.date = None
+     self.city = None
+     self.comfort_data = None  # newly added attribute
+
+ def refresh_driver(self):
+ """Refresh the current page through self.driver.refresh().
+
+ When the refresh raises, self.err is incremented and the error is
+ printed. The method calls itself again while self.err is below
+ max_retry_time and prints a give-up message once the limit is reached.
+ NOTE(review): indentation is lost in this patch, so the exact nesting
+ of the checks that follow the except clause cannot be confirmed here.
+ """
+ try:
+ self.driver.refresh()
+ except Exception as e:
+ # Count the error
+ self.err += 1
+
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} refresh_driver:刷新页面失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
+ )
+
+ # Save an error screenshot
+ if enable_screenshot:
+ self.driver.save_screenshot(
+ f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
+ )
+ if self.err < max_retry_time:
+ # Refresh the page again
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} refresh_driver:刷新页面')
+ self.refresh_driver()
+
+ # Check the error count
+ if self.err >= max_retry_time:
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,refresh_driver:不继续重试'
+ )
+
+ def remove_btn(self):
+     """Remove notice boxes, the customer-service shortcut and share links from the page.
+
+     Best effort: the scripts run in order inside one try block, so the
+     first failure is printed and the remaining scripts are skipped.
+     """
+     cleanup_scripts = (
+         # notices
+         "document.querySelectorAll('.notice-box').forEach(element => element.remove());",
+         # online customer service
+         "document.querySelectorAll('.shortcut, .shortcut-link').forEach(element => element.remove());",
+         # share links
+         "document.querySelectorAll('.shareline').forEach(element => element.remove());",
+     )
+     try:
+         for script in cleanup_scripts:
+             self.driver.execute_script(script)
+     except Exception as e:
+         print(
+             f'{time.strftime("%Y-%m-%d_%H-%M-%S")} remove_btn:提醒移除失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
+         )
+
+ def check_verification_code(self):
+     """Pause for manual captcha solving when a captcha marker is on the page.
+
+     A captcha is assumed when an element with id "verification-code" or
+     class "alert-title" exists. In that case the user is prompted on
+     stdin, then the method waits for the "pc_home-jipiao" element. With or
+     without a captcha, remove_btn() is called afterwards.
+
+     Returns:
+         True when the check completed, False when any step raised.
+     """
+     try:
+         captcha_markers = len(self.driver.find_elements(By.ID, "verification-code")) + len(
+             self.driver.find_elements(By.CLASS_NAME, "alert-title"))
+
+         if captcha_markers:
+             print(
+                 f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码被触发verification-code/alert-title,请手动完成验证。')
+
+             # Block until the user has solved the captcha by hand
+             input("请完成验证码,然后按回车键继续...")
+
+             # Wait for the page to finish loading
+             page_ready = EC.presence_of_element_located((By.CLASS_NAME, "pc_home-jipiao"))
+             WebDriverWait(self.driver, max_wait_time).until(page_ready)
+
+             print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:验证码处理完成,继续执行。')
+
+         # Captcha or not, clear the notices before carrying on
+         self.remove_btn()
+         return True
+     except Exception as e:
+         print(
+             f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_verification_code:未知错误,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
+         )
+         return False
+
+ def login(self):
+ """Fill in and submit the login form when login_allowed is set.
+
+ The account and password are picked from the accounts / passwords
+ lists with self.switch_acc modulo the list length. If no
+ "lg_loginbox_modal" element is present the header login element is
+ clicked first. On any exception self.err is incremented and, while it
+ is below max_retry_time, the page is refreshed and login() is called
+ again.
+ NOTE(review): indentation is lost in this patch, so the exact nesting
+ of the retry checks cannot be confirmed here.
+ """
+ if login_allowed:
+
+ account = accounts[self.switch_acc % len(accounts)]
+ password = passwords[self.switch_acc % len(passwords)]
+
+ try:
+ if len(self.driver.find_elements(By.CLASS_NAME, "lg_loginbox_modal")) == 0:
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:未弹出登录界面')
+ WebDriverWait(self.driver, max_wait_time).until(
+ EC.presence_of_element_located((By.CLASS_NAME, "tl_nfes_home_header_login_wrapper_siwkn")))
+ # Click the header login-wrapper element (the original comment,
+ # "click the plane icon to return to the main page", looks copied
+ # from get_page)
+ ele = WebDriverWait(self.driver, max_wait_time).until(element_to_be_clickable(
+ self.driver.find_element(By.CLASS_NAME, "tl_nfes_home_header_login_wrapper_siwkn")))
+ ele.click()
+ # Wait until the "lg_loginwrap" element is present
+ WebDriverWait(self.driver, max_wait_time).until(
+ EC.presence_of_element_located((By.CLASS_NAME, "lg_loginwrap")))
+ else:
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:已经弹出登录界面')
+
+ ele = WebDriverWait(self.driver, max_wait_time).until(element_to_be_clickable(
+ self.driver.find_elements(By.CLASS_NAME, "r_input.bbz-js-iconable-input")[0]))
+ ele.send_keys(account)
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:输入账户成功')
+
+ ele = WebDriverWait(self.driver, max_wait_time).until(element_to_be_clickable(
+ self.driver.find_element(By.CSS_SELECTOR,
+ "div[data-testid='accountPanel'] input[data-testid='passwordInput']")))
+ ele.send_keys(password)
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:输入密码成功')
+
+ ele = WebDriverWait(self.driver, max_wait_time).until(element_to_be_clickable(
+ self.driver.find_element(By.CSS_SELECTOR, '[for="checkboxAgreementInput"]')))
+ ele.click()
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:勾选同意成功')
+
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(self.driver.find_elements(By.CLASS_NAME, "form_btn.form_btn--block")[0]))
+ ele.click()
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:登录成功')
+ # Save a screenshot after logging in
+ if enable_screenshot:
+ self.driver.save_screenshot(
+ f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
+ )
+ time.sleep(crawal_interval * 3)
+ except Exception as e:
+ # Count the error
+ self.err += 1
+ # Report the error type and message to help debugging
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:页面加载或元素操作失败,错误类型:{type(e).__name__}, 详细误信息:{str(e).split("Stacktrace:")[0]}'
+ )
+
+ # Save an error screenshot
+ if enable_screenshot:
+ self.driver.save_screenshot(
+ f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
+ )
+
+ if self.err < max_retry_time:
+ # Refresh the page
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:刷新页面')
+ self.refresh_driver()
+ # Check for notices and a captcha
+ if self.check_verification_code():
+ # Retry
+ self.login()
+ # Check the error count
+ if self.err >= max_retry_time:
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,login:重新尝试加载页面,这次指定需要重定向到首页'
+ )
+
+ def get_page(self, reset_to_homepage=0):
+ """Open the flight search page and continue with change_city() on success.
+
+ With reset_to_homepage == 1 the domestic flights home page is loaded
+ first. The method then clicks the "pc_home-jipiao" element, the first
+ "radio-label" element (one-way) and the "search-btn" element.
+
+ NOTE(review): the except handler calls self.get_page(1) again and no
+ retry counter is checked in this method, so a persistent failure
+ would recurse without bound; confirm whether that is intended.
+ NOTE(review): indentation is lost in this patch, so which statements
+ sit inside the two `if` blocks of the try body cannot be confirmed.
+ """
+ next_stage_flag = False
+ try:
+ if reset_to_homepage == 1:
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试前往首页...')
+ start_time = time.time()
+ # Go to the home page
+ self.driver.get(
+ "https://flights.ctrip.com/online/channel/domestic")
+ end_time = time.time()
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 前往首页耗时: {end_time - start_time:.2f} 秒')
+
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面 URL: {self.driver.current_url}')
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面标题: {self.driver.title}')
+
+ # Check for notices and a captcha
+ if self.check_verification_code():
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 等待页面加载完成...')
+ WebDriverWait(self.driver, max_wait_time).until(
+ EC.presence_of_element_located(
+ (By.CLASS_NAME, "pc_home-jipiao"))
+ )
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载完成')
+
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试点击飞机图标...')
+ # Click the plane icon to return to the main page
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_element(
+ By.CLASS_NAME, "pc_home-jipiao")
+ )
+ )
+ ele.click()
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功点击飞机图标')
+
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试选择单程...')
+ # One-way trip
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "radio-label")[0]
+ )
+ )
+ ele.click()
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功选择单程')
+
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试点击搜索按钮...')
+ # Search
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_element(By.CLASS_NAME, "search-btn")
+ )
+ )
+ ele.click()
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功点击搜索按钮')
+
+ next_stage_flag = True
+ except Exception as e:
+ # Report the error type and message to help debugging
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_page:页面加载或元素操作失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
+ )
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面 URL: {self.driver.current_url}')
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面标题: {self.driver.title}')
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面源代码: {self.driver.page_source[:500]}...') # print only the first 500 characters
+
+ # Save an error screenshot
+ if enable_screenshot:
+ screenshot_path = f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
+ self.driver.save_screenshot(screenshot_path)
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误截图已保存: {screenshot_path}')
+
+ # Try loading the page again, this time redirecting to the home page
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 重新尝试加载页面,这次指定需要重定向到首页')
+ self.get_page(1)
+ else:
+ if next_stage_flag:
+ # Continue with the next step
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载成功,继续下一步')
+ self.change_city()
+ else:
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载成功,但未能完成所有操作')
+
+ def change_city(self):
+ """Set departure city, destination city and date on the search form, then call get_data().
+
+ The two "form-input-v3" inputs are retyped until they contain
+ self.city[0] and self.city[1]; the date picker is paged by year and
+ month and the day cell matching self.date is clicked. Both city
+ inputs are then clicked, each followed by a click on the
+ "low-price-remind" element, until their value contains "(".
+
+ On an exception self.err is incremented; the method retries itself
+ (logging in first if a login modal is present) while self.err is
+ below max_retry_time, and otherwise resets the counter and calls
+ get_page(1).
+ NOTE(review): indentation is lost in this patch, so the exact nesting
+ of the loops and retry checks cannot be confirmed here.
+ """
+ next_stage_flag = False
+ try:
+ # Wait for the page to finish loading
+ WebDriverWait(self.driver, max_wait_time).until(
+ EC.presence_of_element_located(
+ (By.CLASS_NAME, "form-input-v3"))
+ )
+
+ # Check for notices and a captcha
+ if self.check_verification_code():
+ # While the departure input does not contain the target city, retype it
+ while self.city[0] not in self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3"
+ )[0].get_attribute("value"):
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3")[0]
+ )
+ )
+ ele.click()
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3")[0]
+ )
+ )
+ ele.send_keys(Keys.CONTROL + "a")
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3")[0]
+ )
+ )
+ ele.send_keys(self.city[0])
+
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:更换城市【0】-{self.driver.find_elements(By.CLASS_NAME, "form-input-v3")[0].get_attribute("value")}'
+ )
+
+ # While the destination input does not contain the target city, retype it
+ while self.city[1] not in self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3"
+ )[1].get_attribute("value"):
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3")[1]
+ )
+ )
+ ele.click()
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3")[1]
+ )
+ )
+ ele.send_keys(Keys.CONTROL + "a")
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3")[1]
+ )
+ )
+ ele.send_keys(self.city[1])
+
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:更换城市【1】-{self.driver.find_elements(By.CLASS_NAME, "form-input-v3")[1].get_attribute("value")}'
+ )
+
+ while (
+ self.driver.find_elements(By.CSS_SELECTOR, "[aria-label=请选择日期]")[
+ 0
+ ].get_attribute("value")
+ != self.date
+ ):
+ # Click the date selector
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_element(
+ By.CLASS_NAME, "modifyDate.depart-date"
+ )
+ )
+ )
+ ele.click()
+
+ if int(
+ self.driver.find_elements(
+ By.CLASS_NAME, "date-picker.date-picker-block"
+ )[1]
+ .find_element(By.CLASS_NAME, "year")
+ .text[:-1]
+ ) < int(self.date[:4]):
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME,
+ "in-date-picker.icon.next-ico.iconf-right",
+ )[1]
+ )
+ )
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:更换日期{int(self.driver.find_elements(By.CLASS_NAME, "date-picker.date-picker-block")[1].find_element(By.CLASS_NAME, "year").text[:-1])}小于 {int(self.date[:4])} 向右点击'
+ )
+ ele.click()
+
+ if int(
+ self.driver.find_elements(
+ By.CLASS_NAME, "date-picker.date-picker-block"
+ )[0]
+ .find_element(By.CLASS_NAME, "year")
+ .text[:-1]
+ ) > int(self.date[:4]):
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME,
+ "in-date-picker.icon.prev-ico.iconf-left",
+ )[0]
+ )
+ )
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:更换日期{int(self.driver.find_elements(By.CLASS_NAME, "date-picker.date-picker-block")[0].find_element(By.CLASS_NAME, "year").text[:-1])}大于 {int(self.date[:4])} 向左点击'
+ )
+ ele.click()
+
+ if int(
+ self.driver.find_elements(
+ By.CLASS_NAME, "date-picker.date-picker-block"
+ )[0]
+ .find_element(By.CLASS_NAME, "year")
+ .text[:-1]
+ ) == int(self.date[:4]):
+ if int(
+ self.driver.find_elements(
+ By.CLASS_NAME, "date-picker.date-picker-block"
+ )[0]
+ .find_element(By.CLASS_NAME, "month")
+ .text[:-1]
+ ) > int(self.date[5:7]):
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME,
+ "in-date-picker.icon.prev-ico.iconf-left",
+ )[0]
+ )
+ )
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:更换日期{int(self.driver.find_elements(By.CLASS_NAME, "date-picker.date-picker-block")[0].find_element(By.CLASS_NAME, "month").text[:-1])}大于 {int(self.date[5:7])} 左点击'
+ )
+ ele.click()
+
+ if int(
+ self.driver.find_elements(
+ By.CLASS_NAME, "date-picker.date-picker-block"
+ )[1]
+ .find_element(By.CLASS_NAME, "year")
+ .text[:-1]
+ ) == int(self.date[:4]):
+ if int(
+ self.driver.find_elements(
+ By.CLASS_NAME, "date-picker.date-picker-block"
+ )[1]
+ .find_element(By.CLASS_NAME, "month")
+ .text[:-1]
+ ) < int(self.date[5:7]):
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME,
+ "in-date-picker.icon.next-ico.iconf-right",
+ )[1]
+ )
+ )
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:更换日期{int(self.driver.find_elements(By.CLASS_NAME, "date-picker.date-picker-block")[1].find_element(By.CLASS_NAME, "month").text[:-1])}小于 {int(self.date[5:7])} 向右点击'
+ )
+ ele.click()
+
+ for m in self.driver.find_elements(
+ By.CLASS_NAME, "date-picker.date-picker-block"
+ ):
+ if int(m.find_element(By.CLASS_NAME, "year").text[:-1]) != int(
+ self.date[:4]
+ ):
+ continue
+
+ if int(m.find_element(By.CLASS_NAME, "month").text[:-1]) != int(
+ self.date[5:7]
+ ):
+ continue
+
+ for d in m.find_elements(By.CLASS_NAME, "date-d"):
+ if int(d.text) == int(self.date[-2:]):
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(d)
+ )
+ ele.click()
+ break
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:更换日期-{self.driver.find_elements(By.CSS_SELECTOR, "[aria-label=请选择日期]")[0].get_attribute("value")}'
+ )
+
+ while "(" not in self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3"
+ )[0].get_attribute("value"):
+ # Search with Enter
+ # ele=WebDriverWait(self.driver, max_wait_time).until(element_to_be_clickable(its[1]))
+ # ele.send_keys(Keys.ENTER)
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3")[0]
+ )
+ )
+ ele.click()
+
+ # Click the low-price reminder button in place of pressing Enter
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "low-price-remind"
+ )[0]
+ )
+ )
+ ele.click()
+
+ while "(" not in self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3"
+ )[1].get_attribute("value"):
+ # Search with Enter
+ # ele=WebDriverWait(self.driver, max_wait_time).until(element_to_be_clickable(its[1]))
+ # ele.send_keys(Keys.ENTER)
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "form-input-v3")[1]
+ )
+ )
+ ele.click()
+
+ # Click the low-price reminder button in place of pressing Enter
+ ele = WebDriverWait(self.driver, max_wait_time).until(
+ element_to_be_clickable(
+ self.driver.find_elements(
+ By.CLASS_NAME, "low-price-remind"
+ )[0]
+ )
+ )
+ ele.click()
+
+ next_stage_flag = True
+
+ except Exception as e:
+ # Count the error
+ self.err += 1
+
+ # Save an error screenshot
+ if enable_screenshot:
+ self.driver.save_screenshot(
+ f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
+ )
+
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city:更换市和日期失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
+ )
+
+ # Check for notices and a captcha
+ if self.check_verification_code():
+ if self.err < max_retry_time:
+ if len(self.driver.find_elements(By.CLASS_NAME, "lg_loginbox_modal")):
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:检测到登录弹窗,需要登录'
+ )
+ self.login()
+ # Retry
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:重试')
+ self.change_city()
+ # Check the error count
+ if self.err >= max_retry_time:
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,change_city:重新尝试加载页面,这次指定需要重定向到首页'
+ )
+
+ # Drop the captured requests
+ del self.driver.requests
+
+ # Reset the error counter
+ self.err = 0
+
+ # Try loading the page again, this time redirecting to the home page
+ self.get_page(1)
+ else:
+ if next_stage_flag:
+ # No error: run the next step
+ self.get_data()
+
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} change_city:成功更换城市和日期,当前路线为:{self.city[0]}-{self.city[1]}')
+
+ def get_data(self):
+ """Wait for the batchSearch request and check that it matches the requested route.
+
+ The captured request is stored in self.predata and the result of
+ capture_flight_comfort_data() in self.comfort_data. The first
+ "flightSegments" entry of the request body is compared with
+ self.city and self.date: on a match decode_data() runs, otherwise
+ the error counter is bumped and change_city() is retried.
+
+ On an exception self.err is incremented and the captured requests
+ are dropped; the page is refreshed and get_data() retried while
+ self.err is below max_retry_time, otherwise the counter is reset
+ and get_page(1) is called.
+ NOTE(review): indentation is lost in this patch, so the exact nesting
+ of the retry checks cannot be confirmed here.
+ """
+ try:
+ # Wait for the response to finish loading
+ self.predata = self.driver.wait_for_request(
+ "/international/search/api/search/batchSearch?.*", timeout=max_wait_time
+ )
+ # Capture the getFlightComfort data
+ self.comfort_data = self.capture_flight_comfort_data()
+
+ rb = dict(json.loads(self.predata.body).get("flightSegments")[0])
+
+
+
+ except Exception as e:
+ # Count the error
+ self.err += 1
+
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,get_data:获取数据超时,错误类型:{type(e).__name__}, 错误详细:{str(e).split("Stacktrace:")[0]}'
+ )
+
+ # Save an error screenshot
+ if enable_screenshot:
+ self.driver.save_screenshot(
+ f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
+ )
+
+ # Drop the captured requests
+ del self.driver.requests
+
+ if self.err < max_retry_time:
+ # Refresh the page
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_data:刷新页面')
+ self.refresh_driver()
+
+ # Check for notices and a captcha
+ if self.check_verification_code():
+ # Retry
+ self.get_data()
+
+ # Check the error count
+ if self.err >= max_retry_time:
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 误次数【{self.err}-{max_retry_time}】,get_data:重新尝试加载页面,这次指定需要重定向到首页'
+ )
+
+ # Reset the error counter
+ self.err = 0
+ # Try loading the page again, this time redirecting to the home page
+ self.get_page(1)
+ else:
+ # Drop the captured requests
+ del self.driver.requests
+
+ # Check that the captured request is for the requested route and date
+ if (
+ rb["departureCityName"] == self.city[0]
+ and rb["arrivalCityName"] == self.city[1]
+ and rb["departureDate"] == self.date
+ ):
+ print(f"get_data:城市匹配成功:出发地-{self.city[0]},目的地-{self.city[1]}")
+
+ # Reset the error counter
+ self.err = 0
+
+ # No error: run the next step
+ self.decode_data()
+ else:
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,get_data:刷新页面')
+ # Count the error
+ self.err += 1
+
+ # Save an error screenshot
+ if enable_screenshot:
+ self.driver.save_screenshot(
+ f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
+ )
+
+ # Change the cities again
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_data:重新更换城市:{rb["departureCityName"]}-{rb["arrivalCityName"]}-{rb["departureDate"]}'
+ )
+
+ # Check for notices and a captcha
+ if self.check_verification_code():
+ # Retry
+ self.change_city()
+
+ def decode_data(self):
+ """Decode the captured response body into self.dedata and continue with check_data().
+
+ The body of self.predata.response is sniffed with python-magic. A
+ gzip body is decompressed and decoded as UTF-8; a body reported as
+ "JSON data" is decoded as UTF-8 directly. The text is then parsed
+ with json.loads and stored in self.dedata.
+
+ On an exception self.err is incremented and the captured requests
+ are dropped; the page is refreshed and get_data() retried while
+ self.err is below max_retry_time, otherwise the counter is reset
+ and get_page(1) is called.
+ NOTE(review): indentation is lost in this patch; the layout is kept
+ as found so that no nesting has to be guessed.
+ """
+ try:
+ # Use the python-magic library to check the MIME type
+ mime = magic.Magic()
+ file_type = mime.from_buffer(self.predata.response.body)
+
+ buf = io.BytesIO(self.predata.response.body)
+
+ if "gzip" in file_type:
+ gf = gzip.GzipFile(fileobj=buf)
+ self.dedata = gf.read().decode("UTF-8")
+ elif "JSON data" in file_type:
+ # Fix: this branch used to print the payload without storing it, so
+ # the json.loads below ran on stale or missing self.dedata and an
+ # uncompressed JSON response could never be decoded.
+ self.dedata = buf.read().decode("UTF-8")
+ else:
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 未知的压缩格式:{file_type}')
+
+ self.dedata = json.loads(self.dedata)
+
+ except Exception as e:
+ # Count the error
+ self.err += 1
+
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,decode_data:数据解码失败,错误类型:{type(e).__name__}, 错误详细:{str(e).split("Stacktrace:")[0]}'
+ )
+
+ # Save an error screenshot
+ if enable_screenshot:
+ self.driver.save_screenshot(
+ f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
+ )
+
+ # Drop the captured requests
+ del self.driver.requests
+
+ if self.err < max_retry_time:
+ # Refresh the page
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} decode_data:刷新页面')
+ self.refresh_driver()
+
+ # Check for notices and a captcha
+ if self.check_verification_code():
+ # Retry
+ self.get_data()
+ # Check the error count
+ if self.err >= max_retry_time:
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,decode_data:重新尝试加载页面,这次指定需要重定向到首页'
+ )
+
+ # Reset the error counter
+ self.err = 0
+
+ # Try loading the page again, this time redirecting to the home page
+ self.get_page(1)
+ else:
+ # Reset the error counter
+ self.err = 0
+
+ # No error: run the next step
+ self.check_data()
+
+ def check_data(self):
+ """Validate the decoded response and, when usable, build and merge the result frames.
+
+ self.flightItineraryList is taken from self.dedata["data"] and every
+ itinerary whose first segment has a non-zero transferCount is
+ removed. When nothing is left and direct_flight is set, the method
+ returns 0. Without an exception it then runs proc_flightSegments(),
+ proc_priceList() and mergedata().
+
+ In the exception handler a "searchErrorInfo" entry in the data
+ yields 0; a "needUserLogin" marker triggers login(); the page is
+ refreshed and get_data() retried while self.err is below
+ max_retry_time, otherwise the counter is reset and get_page(1) is
+ called.
+ NOTE(review): itineraries with transfers are removed here even when
+ direct_flight is False; confirm that this is intended.
+ NOTE(review): indentation is lost in this patch, so the exact nesting
+ of the retry checks cannot be confirmed here.
+ """
+ try:
+ self.flightItineraryList = self.dedata["data"]["flightItineraryList"]
+ # Walk backwards and delete itineraries with transfers
+ for i in range(len(self.flightItineraryList) - 1, -1, -1):
+ if (
+ self.flightItineraryList[i]["flightSegments"][0]["transferCount"]
+ != 0
+ ):
+ self.flightItineraryList.pop(i)
+ if len(self.flightItineraryList) == 0 and direct_flight:
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 不存在直航航班:{self.city[0]}-{self.city[1]}')
+ # Reset the error counter
+ self.err = 0
+ return 0
+ except Exception as e:
+ # Count the error
+ self.err += 1
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 数据检查出错:不存在航班,错误类型:{type(e).__name__}, 错误详细:{str(e).split("Stacktrace:")[0]}'
+ )
+ print(self.dedata)
+ if self.err < max_retry_time:
+ if 'searchErrorInfo' in self.dedata["data"]:
+ # Reset the error counter
+ self.err = 0
+ return 0
+ else:
+ if "'needUserLogin': True" in str(self.dedata["data"]):
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,check_data:必须要登录才能查看数据,这次指定需要重定向到首页'
+ )
+ # The data requires a login, so log in first
+ self.login()
+
+ # Refresh the page
+ print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} check_data:刷新页面')
+ self.refresh_driver()
+ # Check for notices and a captcha
+ if self.check_verification_code():
+ # Retry
+ self.get_data()
+ # Check the error count
+ if self.err >= max_retry_time:
+ print(
+ f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,check_data:重新尝试加载页面,这次指定需要重定向到首页'
+ )
+
+ # Reset the error counter
+ self.err = 0
+
+ # Try loading the page again, this time redirecting to the home page
+ self.get_page(1)
+ else:
+ # Reset the error counter
+ self.err = 0
+ self.proc_flightSegments()
+ self.proc_priceList()
+ self.mergedata()
+
+ def proc_flightSegments(self):
+ """Build self.flights from the first flight of each itinerary's first segment.
+
+ For every itinerary the departure and arrival date-times are split
+ at the space into day and time parts, a 'stopInfo' text is derived
+ from 'stopList' ('无中转' when there is none), and, when del_info is
+ set, a fixed list of keys is removed. The resulting dict is turned
+ into a DataFrame and concatenated onto self.flights.
+ """
+ self.flights = pd.DataFrame()
+
+ for flightlist in self.flightItineraryList:
+ flightlist = flightlist["flightSegments"][0]["flightList"]
+ flightUnitList = dict(flightlist[0])
+
+ departureday = flightUnitList["departureDateTime"].split(" ")[0]
+ departuretime = flightUnitList["departureDateTime"].split(" ")[1]
+
+ arrivalday = flightUnitList["arrivalDateTime"].split(" ")[0]
+ arrivaltime = flightUnitList["arrivalDateTime"].split(" ")[1]
+
+ # Handle stopList
+ if 'stopList' in flightUnitList and flightUnitList['stopList']:
+ stop_info = []
+ for stop in flightUnitList['stopList']:
+ stop_info.append(f"{stop['cityName']}({stop['airportName']}, {stop['duration']}分钟)")
+ flightUnitList['stopInfo'] = ' -> '.join(stop_info)
+ else:
+ flightUnitList['stopInfo'] = '无中转'
+
+ if del_info:
+ # Delete some unimportant fields
+ dellist = [
+ "sequenceNo",
+ "marketAirlineCode",
+ "departureProvinceId",
+ "departureCityId",
+ "departureCityCode",
+ "departureAirportShortName",
+ "departureTerminal",
+ "arrivalProvinceId",
+ "arrivalCityId",
+ "arrivalCityCode",
+ "arrivalAirportShortName",
+ "arrivalTerminal",
+ "transferDuration",
+ "stopList", # delete the original stopList
+ "leakedVisaTagSwitch",
+ "trafficType",
+ "highLightPlaneNo",
+ "mealType",
+ "operateAirlineCode",
+ "arrivalDateTime",
+ "departureDateTime",
+ "operateFlightNo",
+ "operateAirlineName",
+ ]
+ for value in dellist:
+ flightUnitList.pop(value, None)
+
+ # Add the split day / time fields
+ flightUnitList.update(
+ {
+ "departureday": departureday,
+ "departuretime": departuretime,
+ "arrivalday": arrivalday,
+ "arrivaltime": arrivaltime,
+ }
+ )
+
+ self.flights = pd.concat(
+ [
+ self.flights,
+ pd.DataFrame.from_dict(flightUnitList, orient="index").T,
+ ],
+ ignore_index=True,
+ )
+
+ def proc_priceList(self):
+     """Collect the cheapest economy and business fare of every itinerary into self.prices.
+
+     For each itinerary in self.flightItineraryList the offers in its
+     "priceList" are grouped by cabin ("Y" economy, "C" business; other
+     cabins are ignored). Tax is estimated as sortPrice - adultPrice, or 0
+     when the offer's freeOilFeeAndTax flag is truthy. Per cabin the offer
+     with the lowest price plus tax wins (the first one on ties); a cabin
+     without offers yields empty strings. One row per itinerary is
+     appended to self.prices, keyed by the part of "itineraryId" before
+     the first underscore.
+     """
+
+     def cheapest(offers):
+         # offers: (adult price, estimated tax, misery index) triples
+         if not offers:
+             return "", "", "", ""
+         base, tax, misery = min(offers, key=lambda offer: offer[0] + offer[1])
+         return base, tax, base + tax, misery
+
+     self.prices = pd.DataFrame()
+
+     for itinerary in self.flightItineraryList:
+         flight_no = itinerary["itineraryId"].split("_")[0]
+         offers_by_cabin = {"Y": [], "C": []}
+
+         for offer in itinerary["priceList"]:
+             adult_price = offer["adultPrice"]
+             fee_free = offer["freeOilFeeAndTax"]
+             sort_price = offer["sortPrice"]
+
+             # Estimated tax
+             estimated_tax = 0 if fee_free else sort_price - adult_price
+
+             misery_index = offer["miseryIndex"]
+             cabin = offer["cabin"]
+
+             if cabin in offers_by_cabin:
+                 offers_by_cabin[cabin].append((adult_price, estimated_tax, misery_index))
+
+         eco_base, eco_tax, eco_total, eco_misery = cheapest(offers_by_cabin["Y"])
+         biz_base, biz_tax, biz_total, biz_misery = cheapest(offers_by_cabin["C"])
+
+         row = {
+             "flightNo": flight_no,
+             "economy_origin": eco_base,
+             "economy_tax": eco_tax,
+             "economy_total": eco_total,
+             "economy_full": eco_misery,
+             "bussiness_origin": biz_base,
+             "bussiness_tax": biz_tax,
+             "bussiness_total": biz_total,
+             "bussiness_full": biz_misery,
+         }
+
+         self.prices = pd.concat(
+             [self.prices, pd.DataFrame(row, index=[0])], ignore_index=True
+         )
+
+ ##
+ def mergedata(self):
+     """Merge flight, price and comfort data, then save the result as CSV.
+
+     Steps:
+       1. Inner-join ``self.flights`` and ``self.prices`` on ``flightNo`` and
+          stamp every row with today's date in ``dateGetTime``.
+       2. If ``self.comfort_data`` is non-empty, left-join it on the flight
+          number (``operateFlightNo`` when that column exists, with
+          ``flightNo`` filling its gaps; otherwise ``flightNo`` alone).
+       3. If ``rename_col`` is truthy, rename the known columns to their
+          Chinese labels; if ``del_info`` is truthy as well, keep only the
+          labelled columns.  Labelled columns missing from the data are
+          written as empty columns instead of aborting the save.
+       4. Write the frame to ``<cwd>/<self.date>/<today>/<city0>-<city1>.csv``.
+
+     ``rename_col`` and ``del_info`` are not defined in this method; they are
+     presumably module-level switches -- confirm against the file header.
+
+     Returns:
+         int: always 0.  Failures are printed with a traceback instead of
+         being raised, so the return value does not signal success.
+     """
+     try:
+         self.df = self.flights.merge(self.prices, on=["flightNo"])
+         print(f"合并后的航班数据形状: {self.df.shape}")
+         print(f"合并后的航班数据列: {self.df.columns}")
+
+         # Record the crawl date.
+         self.df["dateGetTime"] = dt.now().strftime("%Y-%m-%d")
+
+         print(f"获取到的舒适度数据: {self.comfort_data}")
+
+         if self.comfort_data:
+             # comfort_data maps flight number -> dict of comfort fields.
+             comfort_df = pd.DataFrame.from_dict(self.comfort_data, orient='index')
+             comfort_df.reset_index(inplace=True)
+             comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
+
+             print(f"舒适度数据形状: {comfort_df.shape}")
+             print(f"舒适度数据列: {comfort_df.columns}")
+             print(f"舒适度数据前几行: \n{comfort_df.head()}")
+
+             # Temporary join key: prefer the operating flight number.
+             if 'operateFlightNo' in self.df.columns:
+                 print(f"合并前的 operateFlightNo 唯一值: {self.df['operateFlightNo'].unique()}")
+                 self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
+             else:
+                 print("警告: operateFlightNo 列不存在于数据中,将使用 flightNo 进行匹配")
+                 self.df['match_flight_no'] = self.df['flightNo']
+
+             print(f"现有的列: {self.df.columns}")
+             print(f"合并前的 flight_no 唯一值: {comfort_df['flight_no'].unique()}")
+
+             # Left join keeps flights that have no comfort record.
+             self.df = self.df.merge(comfort_df, left_on='match_flight_no', right_on='flight_no', how='left')
+
+             print(f"合并后的数据形状: {self.df.shape}")
+             print(f"合并后的数据列: {self.df.columns}")
+
+             # Drop the temporary key and the duplicated flight_no column.
+             self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True, errors='ignore')
+
+         if rename_col:
+             # Source column -> Chinese label, in output column order.
+             flight_columns = {
+                 "dateGetTime": "数据获取日期",
+                 "flightNo": "航班号",
+                 "marketAirlineName": "航空公司",
+                 "departureday": "出发日期",
+                 "departuretime": "出发时间",
+                 "arrivalday": "到达日期",
+                 "arrivaltime": "到达时间",
+                 "duration": "飞行时长",
+                 "departureCountryName": "出发国家",
+                 "departureCityName": "出发城市",
+                 "departureAirportName": "出发机场",
+                 "departureAirportCode": "出发机场三字码",
+                 "arrivalCountryName": "到达国家",
+                 "arrivalCityName": "到达城市",
+                 "arrivalAirportName": "到达机场",
+                 "arrivalAirportCode": "到达机场三字码",
+                 "aircraftName": "飞机型号",
+                 "aircraftSize": "飞机尺寸",
+                 "aircraftCode": "飞机型号三字码",
+                 "arrivalPunctuality": "到达准点率",
+                 "stopCount": "停留次数",
+                 "stopInfo": "中转信息",
+             }
+             # Same mapping for the comfort fields.
+             comfort_columns = {
+                 'departure_delay_time': '出发延误时间',
+                 'departure_bridge_rate': '出发廊桥率',
+                 'arrival_delay_time': '到达延误时间',
+                 'plane_type': '飞机类型',
+                 'plane_width': '飞机宽度',
+                 'plane_age': '飞机机龄',
+                 'Y_has_meal': '经济舱是否有餐食',
+                 'Y_seat_tilt': '经济舱座椅倾斜度',
+                 'Y_seat_width': '经济舱座椅宽度',
+                 'Y_seat_pitch': '经济舱座椅间距',
+                 'Y_meal_msg': '经济舱餐食信息',
+                 'Y_power': '经济舱电源',
+                 'C_has_meal': '商务舱是否有餐食',
+                 'C_seat_tilt': '商务舱座椅倾斜度',
+                 'C_seat_width': '商务舱座椅宽度',
+                 'C_seat_pitch': '商务舱座椅间距',
+                 'C_meal_msg': '商务舱餐食信息',
+                 'C_power': '商务舱电源',
+             }
+
+             self.df = self.df.rename(columns={**flight_columns, **comfort_columns})
+
+             if del_info:
+                 # reindex() instead of df[...]: when no comfort data was
+                 # captured (or a cabin is missing) some labelled columns do
+                 # not exist, and plain selection raised KeyError, so no CSV
+                 # was written.  Missing columns are now emitted as empty.
+                 wanted = list(flight_columns.values()) + list(comfort_columns.values())
+                 self.df = self.df.reindex(columns=wanted)
+
+         files_dir = os.path.join(
+             os.getcwd(), self.date, dt.now().strftime("%Y-%m-%d")
+         )
+         # exist_ok avoids the check-then-create race of os.path.exists().
+         os.makedirs(files_dir, exist_ok=True)
+
+         filename = os.path.join(
+             files_dir, f"{self.city[0]}-{self.city[1]}.csv")
+
+         # Save as CSV.
+         self.df.to_csv(filename, encoding="UTF-8", index=False)
+
+         # Report the saved file.
+         print(f'\n{time.strftime("%Y-%m-%d_%H-%M-%S")} 数据爬取完成 {filename}\n')
+
+         return 0
+     except Exception as e:
+         # Best-effort step: report the failure instead of raising.
+         # NOTE(review): 0 is returned here as well, so callers cannot detect it.
+         print(f"合并数据失败 {str(e)}")
+         print(f"错误类型: {type(e).__name__}")
+         print(f"错误详情: {str(e)}")
+         import traceback
+         print(f"错误堆栈: {traceback.format_exc()}")
+         return 0
+
+ ## Capture flight comfort information
+ def capture_flight_comfort_data(self, scroll_steps=10, step_pause=0.5, settle_pause=3):
+     """Collect per-flight comfort data from the page's captured requests.
+
+     The page is scrolled to the bottom in ``scroll_steps`` increments,
+     repeating until ``document.body.scrollHeight`` stops changing.  Every
+     entry of ``self.driver.requests`` whose URL contains
+     ``/search/api/flight/comfort/getFlightComfort`` is then decoded: the
+     flight number is the first item of ``flightNoList`` in the request
+     payload and the comfort fields come from the JSON response.
+
+     Args:
+         scroll_steps: number of increments used to reach the bottom.
+         step_pause: seconds to sleep after each increment.
+         settle_pause: seconds to sleep once the bottom is reached.
+
+     Returns:
+         dict | None: flight number -> comfort fields, or ``None`` when
+         nothing was extracted or an unexpected error occurred (errors are
+         printed, not raised).
+     """
+     batch_url = "/search/api/flight/comfort/batchGetComfortTagList"
+     comfort_url = "/search/api/flight/comfort/getFlightComfort"
+     try:
+         last_height = self.driver.execute_script("return document.body.scrollHeight")
+         while True:
+             for step in range(1, scroll_steps + 1):
+                 # Dividing by the step count makes the last step land on the
+                 # bottom; the former fixed "/ 3" overshot it after 3 steps.
+                 scroll_height = last_height * step / scroll_steps
+                 self.driver.execute_script(f"window.scrollTo(0, {scroll_height});")
+                 time.sleep(step_pause)
+
+             # Give lazily loaded content time to arrive.
+             time.sleep(settle_pause)
+
+             new_height = self.driver.execute_script("return document.body.scrollHeight")
+             if new_height == last_height:
+                 break
+             last_height = new_height
+
+         comfort_requests = self.driver.requests
+         comfort_data = {}
+         batch_comfort_found = False
+         getFlightComfort_requests_count = 0
+         total_requests_count = len(comfort_requests)
+
+         print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 开始分析请求,总请求数:{total_requests_count}")
+         for request in comfort_requests:
+             if batch_url in request.url:
+                 batch_comfort_found = True
+                 print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 找到 batchGetComfortTagList 请求")
+                 continue
+             if comfort_url not in request.url:
+                 continue
+
+             getFlightComfort_requests_count += 1
+             print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获到第 {getFlightComfort_requests_count} 个 getFlightComfort 请求:")
+             print(f"URL: {request.url}")
+
+             # The flight number travels in the request payload.
+             try:
+                 payload = json.loads(request.body.decode('utf-8'))
+                 flight_no = payload.get('flightNoList', ['Unknown'])[0]
+                 print(f"请求的航班号: {flight_no}")
+             except Exception as e:
+                 print(f"无法解析请求 payload: {str(e)}")
+                 continue
+
+             if not request.response:
+                 print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 请求没有响应")
+                 continue
+
+             print(f"响应状态码: {request.response.status_code}")
+             try:
+                 # Decompression happens inside this try so that one corrupt
+                 # body skips a single flight instead of aborting the capture.
+                 body = request.response.body
+                 if request.response.headers.get('Content-Encoding', '').lower() == 'gzip':
+                     body = gzip.decompress(body)
+                 json_data = json.loads(body.decode('utf-8'))
+                 print(f"响应数据: {json.dumps(json_data, indent=2, ensure_ascii=False)[:500]}...")
+                 if json_data['status'] != 0 or json_data['msg'] != 'success':
+                     print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} getFlightComfort 响应状态异常: {json_data['status']}, {json_data['msg']}")
+                     continue
+
+                 flight_comfort = json_data['data']
+                 punctuality = flight_comfort['punctualityInfo']
+                 plane_info = flight_comfort['planeInfo']
+                 cabin_info = {cabin['cabin']: cabin for cabin in flight_comfort['cabinInfoList']}
+                 processed_data = {
+                     'departure_delay_time': punctuality['departureDelaytime'],
+                     'departure_bridge_rate': punctuality['departureBridge'],
+                     'arrival_delay_time': punctuality['arrivalDelaytime'],
+                     'plane_type': plane_info['planeTypeName'],
+                     'plane_width': plane_info['planeWidthCategory'],
+                     'plane_age': plane_info['planeAge']
+                 }
+                 # Cabin fields are prefixed with the cabin code (Y or C).
+                 for cabin_type in ['Y', 'C']:
+                     if cabin_type not in cabin_info:
+                         continue
+                     cabin = cabin_info[cabin_type]
+                     processed_data.update({
+                         f'{cabin_type}_has_meal': cabin['hasMeal'],
+                         f'{cabin_type}_seat_tilt': cabin['seatTilt']['value'],
+                         f'{cabin_type}_seat_width': cabin['seatWidth']['value'],
+                         f'{cabin_type}_seat_pitch': cabin['seatPitch']['value'],
+                         f'{cabin_type}_meal_msg': cabin['mealMsg']
+                     })
+                     if 'power' in cabin:
+                         processed_data[f'{cabin_type}_power'] = cabin['power']
+
+                 comfort_data[flight_no] = processed_data
+                 print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 成功提取航班 {flight_no} 的舒适度数据")
+             except Exception as e:
+                 print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 处理 getFlightComfort 响应时出错: {str(e)}")
+
+         print(f"\n{time.strftime('%Y-%m-%d_%H-%M-%S')} 请求分析完成")
+         print(f"总请求数: {total_requests_count}")
+         print(f"batchGetComfortTagList 请求是否找到: {batch_comfort_found}")
+         print(f"getFlightComfort 请求数: {getFlightComfort_requests_count}")
+         print(f"成功提取的舒适度数据数: {len(comfort_data)}")
+
+         if comfort_data:
+             return comfort_data
+
+         print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 未捕获到任何 getFlightComfort 数据")
+         print("可能的原因:")
+         print("1. 网页没有加载完全")
+         print("2. 网站结构可能已经改变")
+         print("3. 网络连接问题")
+         print("4. 请求被网站拦截或限制")
+         return None
+     except Exception as e:
+         print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 捕获 getFlightComfort 数据时出错:{str(e)}")
+         print(f"错误类型: {type(e).__name__}")
+         print(f"错误详情: {str(e)}")
+         import traceback
+         print(f"错误堆栈: {traceback.format_exc()}")
+         return None
+
+
+if __name__ == "__main__":
+    # Crawl every city pair for every flight date, skipping combinations
+    # whose CSV for today already exists.
+    driver = init_driver()
+
+    citys = gen_citys(crawal_citys)
+    flight_dates = generate_flight_dates(crawal_days, begin_date, end_date, start_interval, days_interval)
+    Flight_DataFetcher = DataFetcher(driver)
+
+    # try/finally: the browser is shut down even when a crawl step raises;
+    # previously an exception in the loop skipped driver.quit() entirely.
+    try:
+        for city in citys:
+            Flight_DataFetcher.city = city
+
+            for flight_date in flight_dates:
+                Flight_DataFetcher.date = flight_date
+
+                csv_path = os.path.join(
+                    os.getcwd(), flight_date, dt.now().strftime("%Y-%m-%d"), f"{city[0]}-{city[1]}.csv")
+                if os.path.exists(csv_path):
+                    print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 文件已存在:{csv_path}')
+                    continue
+                elif ('http' not in Flight_DataFetcher.driver.current_url):
+                    print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前的URL是:{driver.current_url}')
+                    # No page loaded yet: open the search page.
+                    Flight_DataFetcher.get_page(1)
+                else:
+                    # A page is already open: only swap the city pair.
+                    Flight_DataFetcher.change_city()
+
+                time.sleep(crawal_interval)
+    finally:
+        # Quit the driver currently held by the fetcher.
+        try:
+            Flight_DataFetcher.driver.quit()
+        except Exception as e:
+            print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} An error occurred while quitting the driver: {e}')
+
+    print(f'\n{time.strftime("%Y-%m-%d_%H-%M-%S")} 程序运行完成!!!!')
diff --git a/src/clawer/db_import.py b/src/clawer/db_import.py
new file mode 100644
index 0000000..774d8ed
--- /dev/null
+++ b/src/clawer/db_import.py
@@ -0,0 +1,90 @@
+import pandas as pd
+import mysql.connector
+from mysql.connector import Error
+import os
+from datetime import datetime, timedelta
+
+# Connection settings; DB_* environment variables override these defaults.
+db_config = {
+    'host': os.environ.get('DB_HOST', 'localhost'),  # host name only
+    'port': int(os.environ.get('DB_PORT', '3307')),  # port is given separately
+    'database': os.environ.get('DB_NAME', 'fly_ticket'),
+    'user': os.environ.get('DB_USER', 'root'),
+    'password': os.environ.get('DB_PASSWORD', '123456'),  # do not rely on the default
+}
+
+def import_csv_to_db(file_path, cursor):
+    """Upsert every row of one crawled flight CSV into the ``flight`` table.
+
+    ``cursor`` is a database cursor; committing and closing stay with the
+    caller.  Empty CSV cells are stored as NULL.  Raises ``KeyError`` if the
+    CSV lacks one of the required columns.
+    """
+    # Built once per file instead of once per row.
+    sql = """INSERT INTO flight (f_n, f_s_p, f_a_p, f_s_a, f_a_a, f_s_t, f_a_t, f_Date, f_Delay, f_p, f_food, f_wide, f_depcode, f_dstcode)
+    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+    ON DUPLICATE KEY UPDATE
+    f_s_p = VALUES(f_s_p),
+    f_a_p = VALUES(f_a_p),
+    f_s_a = VALUES(f_s_a),
+    f_a_a = VALUES(f_a_a),
+    f_s_t = VALUES(f_s_t),
+    f_a_t = VALUES(f_a_t),
+    f_Delay = VALUES(f_Delay),
+    f_p = VALUES(f_p),
+    f_food = VALUES(f_food),
+    f_wide = VALUES(f_wide),
+    f_depcode = VALUES(f_depcode),
+    f_dstcode = VALUES(f_dstcode);"""
+
+    # CSV columns in the order of the INSERT placeholders.
+    csv_columns = [
+        '航班号', '出发城市', '到达城市', '出发机场', '到达机场', '出发时间', '到达时间',
+        '出发日期', '出发延误时间', 'economy_origin', '经济舱餐食信息', '经济舱座椅间距',
+        '出发机场三字码', '到达机场三字码',
+    ]
+
+    df = pd.read_csv(file_path)[csv_columns]
+    # Empty cells are read as NaN, which has no SQL literal; convert them
+    # to None so they are bound as NULL.
+    df = df.astype(object).where(df.notna(), None)
+    for values in df.itertuples(index=False, name=None):
+        cursor.execute(sql, values)
+
+# Import every crawled CSV of the date range, committing once at the end.
+conn = None
+cursor = None
+try:
+    conn = mysql.connector.connect(**db_config)
+    if conn.is_connected():
+        cursor = conn.cursor()
+        # Flight dates to import (inclusive).
+        start_date = datetime(2024, 10, 22)
+        end_date = datetime(2024, 11, 1)
+        current_date = start_date
+
+        while current_date <= end_date:
+            folder_name = current_date.strftime("%Y-%m-%d")
+            folder_path = os.path.join("D:\\college\\SE2\\Ctrip-Crawler-main\\Ctrip-Crawler-main", folder_name, "2024-10-22")
+
+            if os.path.exists(folder_path):
+                for file_name in os.listdir(folder_path):
+                    if file_name.endswith('.csv'):
+                        file_path = os.path.join(folder_path, file_name)
+                        import_csv_to_db(file_path, cursor)
+                        print(f"已导入文件: {file_path}")
+
+            current_date += timedelta(days=1)
+
+        conn.commit()
+        print("所有数据成功插入到数据库")
+except Error as e:
+    print(f"连接数据库时出错: {e}")
+finally:
+    # conn/cursor start as None so cleanup never hits an unbound name,
+    # e.g. when connect() or conn.cursor() raised.
+    if conn is not None and conn.is_connected():
+        if cursor is not None:
+            cursor.close()
+        conn.close()
+        print("数据库连接已关闭")