|
|
|
@ -16,13 +16,13 @@ from datetime import datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 爬取的城市
|
|
|
|
|
crawal_citys = ["天津", "贵阳"]
|
|
|
|
|
crawal_citys = ["天津", "泉州"]
|
|
|
|
|
|
|
|
|
|
# 爬取日期范围:起始日期。格式'2023-12-01'
|
|
|
|
|
begin_date = "2024-10-21"
|
|
|
|
|
begin_date = "2024-10-22"
|
|
|
|
|
|
|
|
|
|
# 爬取日期范围:结束日期。格式'2023-12-31'
|
|
|
|
|
end_date = "2024-10-22"
|
|
|
|
|
end_date = "2024-11-01"
|
|
|
|
|
|
|
|
|
|
# 爬取T+N,即N天后
|
|
|
|
|
start_interval = 1
|
|
|
|
@ -79,8 +79,7 @@ def download_stealth_js(file_path, url='https://raw.githubusercontent.com/requir
|
|
|
|
|
print(f"{file_path} already exists, no need to download.")
|
|
|
|
|
|
|
|
|
|
def init_driver():
|
|
|
|
|
# options = webdriver.ChromeOptions() # 创建一个配置对象
|
|
|
|
|
options = webdriver.EdgeOptions() # 创建一个配置对象
|
|
|
|
|
options = webdriver.ChromeOptions() # 改为ChromeOptions
|
|
|
|
|
options.add_argument("--incognito") # 隐身模式(无痕模式)
|
|
|
|
|
# options.add_argument('--headless') # 启用无头模式
|
|
|
|
|
options.add_argument("--no-sandbox")
|
|
|
|
@ -96,10 +95,11 @@ def init_driver():
|
|
|
|
|
options.add_argument("--ignore-certificate-errors-spki-list")
|
|
|
|
|
options.add_argument("--ignore-ssl-errors")
|
|
|
|
|
options.add_experimental_option("excludeSwitches", ["enable-automation"]) # 不显示正在受自动化软件控制的提示
|
|
|
|
|
# chromeDriverPath = 'C:/Program Files/Google/Chrome/Application/chromedriver' #chromedriver位置
|
|
|
|
|
# options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69")
|
|
|
|
|
# driver = webdriver.Chrome(executable_path=self.chromeDriverPath,chrome_options=self.options)
|
|
|
|
|
driver = webdriver.Edge(options=options)
|
|
|
|
|
|
|
|
|
|
# 如果需要指定Chrome驱动的路径,取消下面这行的注释并设置正确的路径
|
|
|
|
|
# chromedriver_path = '/path/to/chromedriver'
|
|
|
|
|
|
|
|
|
|
driver = webdriver.Chrome(options=options) # 改为Chrome,如果需要指定路径,可以加上executable_path参数
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
download_stealth_js(stealth_js_path)
|
|
|
|
@ -316,7 +316,7 @@ class DataFetcher(object):
|
|
|
|
|
|
|
|
|
|
if self.err < max_retry_time:
|
|
|
|
|
# 刷新页面
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:刷新页')
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login:刷新页面')
|
|
|
|
|
self.refresh_driver()
|
|
|
|
|
# 检查注意事项和验证码
|
|
|
|
|
if self.check_verification_code():
|
|
|
|
@ -332,16 +332,27 @@ class DataFetcher(object):
|
|
|
|
|
next_stage_flag = False
|
|
|
|
|
try:
|
|
|
|
|
if reset_to_homepage == 1:
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试前往首页...')
|
|
|
|
|
start_time = time.time()
|
|
|
|
|
# 前往首页
|
|
|
|
|
self.driver.get(
|
|
|
|
|
"https://flights.ctrip.com/online/channel/domestic")
|
|
|
|
|
end_time = time.time()
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 前往首页耗时: {end_time - start_time:.2f} 秒')
|
|
|
|
|
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面 URL: {self.driver.current_url}')
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面标题: {self.driver.title}')
|
|
|
|
|
|
|
|
|
|
# 检查注意事项和验证码
|
|
|
|
|
if self.check_verification_code():
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 等待页面加载完成...')
|
|
|
|
|
WebDriverWait(self.driver, max_wait_time).until(
|
|
|
|
|
EC.presence_of_element_located(
|
|
|
|
|
(By.CLASS_NAME, "pc_home-jipiao"))
|
|
|
|
|
)
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载完成')
|
|
|
|
|
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试点击飞机图标...')
|
|
|
|
|
# 点击飞机图标,返回主界面
|
|
|
|
|
ele = WebDriverWait(self.driver, max_wait_time).until(
|
|
|
|
|
element_to_be_clickable(
|
|
|
|
@ -350,7 +361,9 @@ class DataFetcher(object):
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
ele.click()
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功点击飞机图标')
|
|
|
|
|
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试选择单程...')
|
|
|
|
|
# 单程
|
|
|
|
|
ele = WebDriverWait(self.driver, max_wait_time).until(
|
|
|
|
|
element_to_be_clickable(
|
|
|
|
@ -359,7 +372,9 @@ class DataFetcher(object):
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
ele.click()
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功选择单程')
|
|
|
|
|
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试点击搜索按钮...')
|
|
|
|
|
# 搜索
|
|
|
|
|
ele = WebDriverWait(self.driver, max_wait_time).until(
|
|
|
|
|
element_to_be_clickable(
|
|
|
|
@ -367,6 +382,7 @@ class DataFetcher(object):
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
ele.click()
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功点击搜索按钮')
|
|
|
|
|
|
|
|
|
|
next_stage_flag = True
|
|
|
|
|
except Exception as e:
|
|
|
|
@ -374,20 +390,26 @@ class DataFetcher(object):
|
|
|
|
|
print(
|
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_page:页面加载或元素操作失败,错误类型:{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
|
|
|
|
|
)
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面 URL: {self.driver.current_url}')
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面标题: {self.driver.title}')
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面源代码: {self.driver.page_source[:500]}...') # 只打印前500个字符
|
|
|
|
|
|
|
|
|
|
# 保存错误截图
|
|
|
|
|
if enable_screenshot:
|
|
|
|
|
self.driver.save_screenshot(
|
|
|
|
|
f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
|
|
|
|
|
)
|
|
|
|
|
screenshot_path = f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
|
|
|
|
|
self.driver.save_screenshot(screenshot_path)
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误截图已保存: {screenshot_path}')
|
|
|
|
|
|
|
|
|
|
# 重新尝试加载页面,这次指定需要重定向到首页
|
|
|
|
|
# 重新尝试加载页面,这次指定需要重定向到首页
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 重新尝试加载页面,这次指定需要重定向到首页')
|
|
|
|
|
self.get_page(1)
|
|
|
|
|
else:
|
|
|
|
|
if next_stage_flag:
|
|
|
|
|
# 继续下一步
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载成功,继续下一步')
|
|
|
|
|
self.change_city()
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载成功,但未能完成所有操作')
|
|
|
|
|
def change_city(self):
|
|
|
|
|
next_stage_flag = False
|
|
|
|
|
try:
|
|
|
|
@ -733,14 +755,13 @@ class DataFetcher(object):
|
|
|
|
|
# 判断错误次数
|
|
|
|
|
if self.err >= max_retry_time:
|
|
|
|
|
print(
|
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,get_data:重新尝试加载页面,这次指定需要重定向到首页'
|
|
|
|
|
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,get_data:重新尝试加载页面,这次指定需要重定向到首页'
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# 重置错误计数
|
|
|
|
|
self.err = 0
|
|
|
|
|
# 重新尝试加载页面,这次指定需要重定向到首页
|
|
|
|
|
self.get_page(1)
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
# 删除本次请求
|
|
|
|
|
del self.driver.requests
|
|
|
|
@ -1074,15 +1095,15 @@ class DataFetcher(object):
|
|
|
|
|
# 检查 operateFlightNo 列是否存在
|
|
|
|
|
if 'operateFlightNo' in self.df.columns:
|
|
|
|
|
print(f"合并前的 operateFlightNo 唯一值: {self.df['operateFlightNo'].unique()}")
|
|
|
|
|
# 创建一个临时列来存储用于匹配的航班号
|
|
|
|
|
self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
|
|
|
|
|
else:
|
|
|
|
|
print("警告: operateFlightNo 列不存在于数据中")
|
|
|
|
|
print(f"现有的列: {self.df.columns}")
|
|
|
|
|
print("警告: operateFlightNo 列不存在于数据中,将使用 flightNo 进行匹配")
|
|
|
|
|
self.df['match_flight_no'] = self.df['flightNo']
|
|
|
|
|
|
|
|
|
|
print(f"现有的列: {self.df.columns}")
|
|
|
|
|
print(f"合并前的 flight_no 唯一值: {comfort_df['flight_no'].unique()}")
|
|
|
|
|
|
|
|
|
|
# 创建一个临时列来存储用于匹配的航班号
|
|
|
|
|
self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
|
|
|
|
|
|
|
|
|
|
# 使用 left join 来合并数据
|
|
|
|
|
self.df = self.df.merge(comfort_df, left_on='match_flight_no', right_on='flight_no', how='left')
|
|
|
|
|
|
|
|
|
@ -1090,7 +1111,8 @@ class DataFetcher(object):
|
|
|
|
|
print(f"合并后的数据列: {self.df.columns}")
|
|
|
|
|
|
|
|
|
|
# 删除临时列和多余的flight_no列
|
|
|
|
|
self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True)
|
|
|
|
|
self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True, errors='ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if rename_col:
|
|
|
|
|
# 对pandas的columns进行重命名
|
|
|
|
@ -1305,12 +1327,12 @@ class DataFetcher(object):
|
|
|
|
|
comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
|
|
|
|
|
|
|
|
|
|
# 保存舒适度数据为CSV文件
|
|
|
|
|
save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
|
|
|
|
|
os.makedirs(save_dir, exist_ok=True)
|
|
|
|
|
# save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
|
|
|
|
|
# os.makedirs(save_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
|
|
|
|
|
comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False)
|
|
|
|
|
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}")
|
|
|
|
|
# comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
|
|
|
|
|
# comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False)
|
|
|
|
|
# print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}")
|
|
|
|
|
|
|
|
|
|
return comfort_data
|
|
|
|
|
else:
|
|
|
|
|