merge without operateFlightNo

dev-clawer
Lin 4 months ago
parent d4e76dba28
commit 1c8a00fbbb

@ -16,13 +16,13 @@ from datetime import datetime
# 爬取的城市
crawal_citys = ["天津", "贵阳"]
crawal_citys = ["天津", "泉州"]
# 爬取日期范围:起始日期。格式'2023-12-01'
begin_date = "2024-10-21"
begin_date = "2024-10-22"
# 爬取日期范围:结束日期。格式'2023-12-31'
end_date = "2024-10-22"
end_date = "2024-11-01"
# 爬取T+N即N天后
start_interval = 1
@ -79,8 +79,7 @@ def download_stealth_js(file_path, url='https://raw.githubusercontent.com/requir
print(f"{file_path} already exists, no need to download.")
def init_driver():
# options = webdriver.ChromeOptions() # 创建一个配置对象
options = webdriver.EdgeOptions() # 创建一个配置对象
options = webdriver.ChromeOptions() # 改为ChromeOptions
options.add_argument("--incognito") # 隐身模式(无痕模式)
# options.add_argument('--headless') # 启用无头模式
options.add_argument("--no-sandbox")
@ -96,10 +95,11 @@ def init_driver():
options.add_argument("--ignore-certificate-errors-spki-list")
options.add_argument("--ignore-ssl-errors")
options.add_experimental_option("excludeSwitches", ["enable-automation"]) # 不显示正在受自动化软件控制的提示
# chromeDriverPath = 'C:/Program Files/Google/Chrome/Application/chromedriver' #chromedriver位置
# options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69")
# driver = webdriver.Chrome(executable_path=self.chromeDriverPath,chrome_options=self.options)
driver = webdriver.Edge(options=options)
# 如果需要指定Chrome驱动的路径取消下面这行的注释并设置正确的路径
# chromedriver_path = '/path/to/chromedriver'
driver = webdriver.Chrome(options=options) # 改为Chrome如果需要指定路径可以加上executable_path参数
try:
download_stealth_js(stealth_js_path)
@ -316,7 +316,7 @@ class DataFetcher(object):
if self.err < max_retry_time:
# 刷新页面
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login刷新页')
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} login刷新页')
self.refresh_driver()
# 检查注意事项和验证码
if self.check_verification_code():
@ -332,16 +332,27 @@ class DataFetcher(object):
next_stage_flag = False
try:
if reset_to_homepage == 1:
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试前往首页...')
start_time = time.time()
# 前往首页
self.driver.get(
"https://flights.ctrip.com/online/channel/domestic")
end_time = time.time()
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 前往首页耗时: {end_time - start_time:.2f}')
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面 URL: {self.driver.current_url}')
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面标题: {self.driver.title}')
# 检查注意事项和验证码
if self.check_verification_code():
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 等待页面加载完成...')
WebDriverWait(self.driver, max_wait_time).until(
EC.presence_of_element_located(
(By.CLASS_NAME, "pc_home-jipiao"))
)
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载完成')
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试点击飞机图标...')
# 点击飞机图标,返回主界面
ele = WebDriverWait(self.driver, max_wait_time).until(
element_to_be_clickable(
@ -350,7 +361,9 @@ class DataFetcher(object):
)
)
ele.click()
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功点击飞机图标')
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试选择单程...')
# 单程
ele = WebDriverWait(self.driver, max_wait_time).until(
element_to_be_clickable(
@ -359,7 +372,9 @@ class DataFetcher(object):
)
)
ele.click()
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功选择单程')
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 尝试点击搜索按钮...')
# 搜索
ele = WebDriverWait(self.driver, max_wait_time).until(
element_to_be_clickable(
@ -367,6 +382,7 @@ class DataFetcher(object):
)
)
ele.click()
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 成功点击搜索按钮')
next_stage_flag = True
except Exception as e:
@ -374,20 +390,26 @@ class DataFetcher(object):
print(
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} get_page页面加载或元素操作失败错误类型{type(e).__name__}, 详细错误信息:{str(e).split("Stacktrace:")[0]}'
)
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面 URL: {self.driver.current_url}')
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面标题: {self.driver.title}')
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 当前页面源代码: {self.driver.page_source[:500]}...') # 只打印前500个字符
# 保存错误截图
if enable_screenshot:
self.driver.save_screenshot(
f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
)
screenshot_path = f'screenshot/screenshot_{time.strftime("%Y-%m-%d_%H-%M-%S")}.png'
self.driver.save_screenshot(screenshot_path)
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误截图已保存: {screenshot_path}')
# 重新尝试加载页面,这指定需要重向到首页
# 重新尝试加载页面,这次指定需要重定向到首页
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 重新尝试加载页面,这次指定需要重定向到首页')
self.get_page(1)
else:
if next_stage_flag:
# 继续下一步
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载成功,继续下一步')
self.change_city()
else:
print(f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 页面加载成功,但未能完成所有操作')
def change_city(self):
next_stage_flag = False
try:
@ -733,14 +755,13 @@ class DataFetcher(object):
# 判断错误次数
if self.err >= max_retry_time:
print(
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 错误次数【{self.err}-{max_retry_time}】,get_data:重新尝试加载页面,这次指定需要重定向到首页'
f'{time.strftime("%Y-%m-%d_%H-%M-%S")} 误次数【{self.err}-{max_retry_time}】,get_data:重新尝试加载页面,这次指定需要重定向到首页'
)
# 重置错误计数
self.err = 0
# 重新尝试加载页面,这次指定需要重定向到首页
self.get_page(1)
else:
# 删除本次请求
del self.driver.requests
@ -1074,15 +1095,15 @@ class DataFetcher(object):
# 检查 operateFlightNo 列是否存在
if 'operateFlightNo' in self.df.columns:
print(f"合并前的 operateFlightNo 唯一值: {self.df['operateFlightNo'].unique()}")
# 创建一个临时列来存储用于匹配的航班号
self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
else:
print("警告: operateFlightNo 列不存在于数据中")
print(f"现有的列: {self.df.columns}")
print("警告: operateFlightNo 列不存在于数据中,将使用 flightNo 进行匹配")
self.df['match_flight_no'] = self.df['flightNo']
print(f"现有的列: {self.df.columns}")
print(f"合并前的 flight_no 唯一值: {comfort_df['flight_no'].unique()}")
# 创建一个临时列来存储用于匹配的航班号
self.df['match_flight_no'] = self.df['operateFlightNo'].fillna(self.df['flightNo'])
# 使用 left join 来合并数据
self.df = self.df.merge(comfort_df, left_on='match_flight_no', right_on='flight_no', how='left')
@ -1090,7 +1111,8 @@ class DataFetcher(object):
print(f"合并后的数据列: {self.df.columns}")
# 删除临时列和多余的flight_no列
self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True)
self.df.drop(['match_flight_no', 'flight_no'], axis=1, inplace=True, errors='ignore')
if rename_col:
# 对pandas的columns进行重命名
@ -1305,12 +1327,12 @@ class DataFetcher(object):
comfort_df.rename(columns={'index': 'flight_no'}, inplace=True)
# 保存舒适度数据为CSV文件
save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
os.makedirs(save_dir, exist_ok=True)
# save_dir = os.path.join(os.getcwd(), self.date, datetime.now().strftime("%Y-%m-%d"))
# os.makedirs(save_dir, exist_ok=True)
comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False)
print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}")
# comfort_filename = os.path.join(save_dir, f"{self.city[0]}-{self.city[1]}_comfort.csv")
# comfort_df.to_csv(comfort_filename, encoding="UTF-8", index=False)
# print(f"{time.strftime('%Y-%m-%d_%H-%M-%S')} 航班舒适度数据已保存到 {comfort_filename}")
return comfort_data
else:

Loading…
Cancel
Save