Chuzhuoli_branch
Lin 2 months ago
parent 639478f03b
commit 4666433316

LICENSE
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Suysker
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md
@@ -0,0 +1,50 @@
# Ctrip-Crawler
## Overview
Ctrip-Crawler is a crawler for Ctrip flight information, built primarily on the Selenium framework.
Hitting the Ctrip API directly with `requests` is no longer viable: IP rate limits and the JS reverse-engineering involved now cause that approach to fail with errors.
Ctrip is reachable over IPv6, so the IP limits can be sidestepped by generating a large pool of IPv6 source addresses.
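A minimal sketch of that idea, assuming the host has a routed /64 prefix to draw addresses from (the prefix below is a documentation placeholder, and actually binding such addresses is OS-dependent, e.g. Linux `IP_FREEBIND`):

```python
# Sketch only: pick random source addresses inside a routed /64.
# PREFIX is the RFC 3849 documentation prefix, not a real allocation.
import random
import ipaddress

PREFIX = ipaddress.IPv6Network("2001:db8:1:1::/64")

def random_ipv6() -> str:
    """Return a random address inside the /64 prefix."""
    host_bits = random.getrandbits(64)  # randomize the interface identifier
    return str(ipaddress.IPv6Address(int(PREFIX.network_address) | host_bits))

print(random_ipv6())
```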
## Key Features
- **Selenium automation**: Unlike direct API requests, the project drives a real browser through Selenium, providing a highly customizable, interactive simulation.
- **Flexible error handling**: Each class of exception (timeouts, CAPTCHAs, unknown errors, etc.) gets its own handling strategy, including retries and manual intervention.
- **IP-limit workaround**: Page behavior and user simulation are exploited to evade IP restrictions, improving crawl stability.
- **Data validation and parsing**: Captured data is strictly checked for quality and completeness, including gzip decompression and JSON parsing (see the sketch after this list).
- **Version history**: V2 solved the CAPTCHA problem, V3 improved stability and availability, and V3.5 added generation and proxying of multiple IPv6 interfaces on Linux.
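A minimal sketch of the decompress-and-parse step, assuming responses are captured with selenium-wire as in the script in this commit; the `"flightList"` URL filter is an illustrative assumption, not Ctrip's actual endpoint:

```python
import gzip
import json

def parse_flight_responses(driver):
    """Decompress and JSON-parse captured flight responses (sketch)."""
    results = []
    for request in driver.requests:  # selenium-wire records captured traffic here
        if request.response is None or "flightList" not in request.url:
            continue  # illustrative filter, not the real endpoint
        body = request.response.body
        if request.response.headers.get("Content-Encoding") == "gzip":
            body = gzip.decompress(body)  # some responses arrive gzip-compressed
        try:
            results.append(json.loads(body))
        except json.JSONDecodeError:
            continue  # skip bodies that are not valid JSON
    return results
```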
## Documentation and Tutorials
Detailed usage guides and development notes are available in these blog posts:
- [Selenium-based Ctrip flight crawler](https://blog.suysker.xyz/archives/35)
- [Selenium-based Ctrip flight crawler V2](https://blog.suysker.xyz/archives/139)
- [requests-based Ctrip flight crawler](https://blog.suysker.xyz/archives/37)
- [requests-based historical flight price crawler](https://blog.suysker.xyz/archives/36)
## TO DO
V4.0: add multithreaded, sharded runs… (rough sketch below)
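No V4.0 code exists yet; the following is only a rough sketch of what sharded multithreaded runs could look like, reusing `init_driver` and `crawal_citys` from the script in this commit, with `crawl_pair` as a hypothetical per-route crawl function:

```python
from concurrent.futures import ThreadPoolExecutor
from itertools import permutations

def crawl_shard(pairs):
    driver = init_driver()  # one browser instance per worker
    try:
        for dep, arr in pairs:
            crawl_pair(driver, dep, arr)  # hypothetical: crawl one route
    finally:
        driver.quit()

city_pairs = list(permutations(crawal_citys, 2))
shards = [city_pairs[i::4] for i in range(4)]  # round-robin split into 4 shards
with ThreadPoolExecutor(max_workers=4) as pool:
    pool.map(crawl_shard, shards)
```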
## Contributing & Feedback
If you have optimization suggestions or find any bugs, please reach out via Issues or Pull Requests. Contributions of all kinds are very welcome!

@@ -0,0 +1,136 @@
import magic
import io
import os
import gzip
import time
import json
import requests
import pandas as pd
from seleniumwire import webdriver
from datetime import datetime as dt, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Cities to crawl
crawal_citys = ["上海", "香港", "东京"]
# Crawl date range: start date, e.g. '2023-12-01' (None = derive from start_interval)
begin_date = None
# Crawl date range: end date, e.g. '2023-12-31'
end_date = None
# Start at T+N, i.e. N days from today
start_interval = 1
# Number of days to crawl
crawal_days = 60
# Delay between city pairs (seconds)
crawal_interval = 5
# Step between crawled dates (days)
days_interval = 1
# Maximum wait for page loads (seconds)
max_wait_time = 10
# Maximum number of retries on error
max_retry_time = 5
# Only fetch direct flights (True: direct only, False: all flights)
direct_flight = True
# Drop less important fields from the results
del_info = False
# Rename the DataFrame columns
rename_col = True
# Save debug screenshots
enable_screenshot = False
# Allow login (login may be required to fetch data)
login_allowed = True
# Accounts
accounts = ['','']
# Passwords
passwords = ['','']
# Hide Selenium fingerprints with stealth.min.js
stealth_js_path = './stealth.min.js'
# Download stealth.min.js if it is not already present locally
def download_stealth_js(file_path, url='https://raw.githubusercontent.com/requireCool/stealth.min.js/main/stealth.min.js'):
    if not os.path.exists(file_path):
        print(f"{file_path} not found, downloading...")
        response = requests.get(url)
        response.raise_for_status()  # abort on HTTP errors
        with open(file_path, 'w') as file:
            file.write(response.text)
        print(f"{file_path} downloaded.")
    else:
        print(f"{file_path} already exists, no need to download.")
def init_driver():
    # options = webdriver.ChromeOptions()  # create a Chrome configuration object
    options = webdriver.EdgeOptions()  # create an Edge configuration object
    options.add_argument("--incognito")  # private (incognito) mode
    # options.add_argument('--headless')  # enable headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-extensions")
    options.page_load_strategy = "eager"  # return once the DOM is ready ("--pageLoadStrategy" is not a valid CLI flag)
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--ignore-certificate-errors-spki-list")
    options.add_argument("--ignore-ssl-errors")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])  # hide the "controlled by automated software" banner
    # chromeDriverPath = 'C:/Program Files/Google/Chrome/Application/chromedriver'  # chromedriver location
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69")
    # driver = webdriver.Chrome(executable_path=self.chromeDriverPath, chrome_options=self.options)
    driver = webdriver.Edge(options=options)
    try:
        download_stealth_js(stealth_js_path)
        # Read stealth.min.js and inject it before any page script runs
        with open(stealth_js_path, 'r') as file:
            stealth_js = file.read()
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": stealth_js})
    except Exception as e:
        print(e)
    driver.maximize_window()
    return driver

def element_to_be_clickable(element):
    def check_clickable(driver):
        try:
            if element.is_enabled() and element.is_displayed():
                return element  # condition met: return the element itself
            else:
                return False
        except Exception:
            return False
    return check_clickable
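# Usage sketch for the custom wait condition above. The locator is an
# illustrative assumption, not Ctrip's real search-button class:
#
#   driver = init_driver()
#   button = driver.find_element(By.CLASS_NAME, "search-btn")
#   WebDriverWait(driver, max_wait_time).until(element_to_be_clickable(button)).click()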
print(f'\n{time.strftime("%Y-%m-%d_%H-%M-%S")} Program finished!')

File diff suppressed because one or more lines are too long