diff --git a/README.md b/README.md index cbd18e0..693e893 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ pip how selenium from selenium import webdriver from selenium.webdriver.chrome.options import Options chrome_options = Options() +# chrome_options.add_argument('lang=zh_CN.UTF-8') # 设置中文 chrome_options.add_argument('--headless') # 无界面 chrome_options.add_argument('--no-sandbox') # 解决DevToolsActivePort文件不存在报错问题 chrome_options.add_argument('--disable-gpu') # 禁用GPU硬件加速。如果软件渲染器没有就位,则GPU进程将不会启动。 @@ -149,9 +150,17 @@ html1 = etree.parse('test.html',etree.HTMLParser(encoding='utf-8')) ``` -### ChromeDriver +请求头,cookie等 -下载 [ChromeDriver](https://chromedriver.chromium.org/home) 放到 python 根目录就行 +```python +# 访问 https://httpbin.org/get?show_env=1 可以返回当前浏览器的请求信息 +options.add_argument('lang=zh_CN.UTF-8') + +``` + +ChromeDriver + +下载 [ChromeDriver](https://chromedriver.chromium.org/home) 放到当前目录就行(如果是放在 python 根目录可以不用在实例化 selenium 时指定chromedriver 路径) ### Redis @@ -199,4 +208,4 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db = 11,[如何理解Python装饰器?- 知乎](https://www.zhihu.com/question/26930016/answer/360300235) -12, \ No newline at end of file +12,[【自动化】selenium设置请求头](https://www.jianshu.com/p/419eb4e00963) \ No newline at end of file diff --git a/downloader.py b/downloader.py index 06e236b..2bba65b 100644 --- a/downloader.py +++ b/downloader.py @@ -3,23 +3,25 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options from lxml import etree +import random +import settings headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36' + 'User-Agent': random.choice(settings.USER_AGENT) } def getsource(url): - init = Options() + initChrome = Options() - init.add_argument('--no-sandbox') - init.add_argument('--headless') - init.add_argument('--disable-gpu') - init.add_argument("disable-cache") - init.add_argument('disable-infobars') - init.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0 - init.add_experimental_option("excludeSwitches",['enable-automation','enable-logging']) + initChrome.add_argument('--no-sandbox') + initChrome.add_argument('--headless') + initChrome.add_argument('--disable-gpu') + initChrome.add_argument("disable-cache") + initChrome.add_argument('disable-infobars') + initChrome.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0 + initChrome.add_experimental_option("excludeSwitches",['enable-automation','enable-logging']) - driver = webdriver.Chrome(chrome_options = init) + driver = webdriver.Chrome(chrome_options = initChrome, executable_path = './chromedriver.exe') driver.implicitly_wait(10) driver.get(url) diff --git a/settings.py b/settings.py index 01db6e4..37dfb46 100644 --- a/settings.py +++ b/settings.py @@ -28,4 +28,12 @@ FILEPATH = { REDIS_HOST = 'tencentCloud' REDIS_PORT = '6379' REDIS_PASSWORD = 'root' -REDIS_LISTNAME = "urlList" \ No newline at end of file +REDIS_LISTNAME = "urlList" + +# 下载器相关配置 +USER_AGENT = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586', + 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0' +] \ No newline at end of file