更新useragent(好像没什么用)

master
wkyuu 3 years ago
parent f0915223b0
commit e0f470fa65

@ -109,6 +109,7 @@ pip show selenium
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
chrome_options = Options() chrome_options = Options()
# chrome_options.add_argument('lang=zh_CN.UTF-8') # 设置中文
chrome_options.add_argument('--headless') # 无界面 chrome_options.add_argument('--headless') # 无界面
chrome_options.add_argument('--no-sandbox') # 解决DevToolsActivePort文件不存在报错问题 chrome_options.add_argument('--no-sandbox') # 解决DevToolsActivePort文件不存在报错问题
chrome_options.add_argument('--disable-gpu') # 禁用GPU硬件加速。如果软件渲染器没有就位则GPU进程将不会启动。 chrome_options.add_argument('--disable-gpu') # 禁用GPU硬件加速。如果软件渲染器没有就位则GPU进程将不会启动。
@ -149,9 +150,17 @@ html1 = etree.parse('test.html',etree.HTMLParser(encoding='utf-8'))
``` ```
### ChromeDriver 请求头cookie等
下载 [ChromeDriver](https://chromedriver.chromium.org/home) 放到 python 根目录就行 ```python
# 访问 https://httpbin.org/get?show_env=1 可以返回当前浏览器的请求信息
chrome_options.add_argument('lang=zh_CN.UTF-8')
```
ChromeDriver
下载 [ChromeDriver](https://chromedriver.chromium.org/home) 并放到当前目录即可(如果放在 python 根目录,则实例化 selenium 时无需指定 chromedriver 路径)
### Redis ### Redis
@ -199,4 +208,4 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
11[如何理解Python装饰器- 知乎](https://www.zhihu.com/question/26930016/answer/360300235) 11[如何理解Python装饰器- 知乎](https://www.zhihu.com/question/26930016/answer/360300235)
12 12[【自动化】selenium设置请求头](https://www.jianshu.com/p/419eb4e00963)

@ -3,23 +3,25 @@
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from lxml import etree from lxml import etree
import random
import settings
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36' 'User-Agent': random.choice(settings.USER_AGENT)
} }
def getsource(url): def getsource(url):
init = Options() initChrome = Options()
init.add_argument('--no-sandbox') initChrome.add_argument('--no-sandbox')
init.add_argument('--headless') initChrome.add_argument('--headless')
init.add_argument('--disable-gpu') initChrome.add_argument('--disable-gpu')
init.add_argument("disable-cache") initChrome.add_argument("disable-cache")
init.add_argument('disable-infobars') initChrome.add_argument('disable-infobars')
init.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0 initChrome.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0
init.add_experimental_option("excludeSwitches",['enable-automation','enable-logging']) initChrome.add_experimental_option("excludeSwitches",['enable-automation','enable-logging'])
driver = webdriver.Chrome(chrome_options = init) driver = webdriver.Chrome(chrome_options = initChrome, executable_path = './chromedriver.exe')
driver.implicitly_wait(10) driver.implicitly_wait(10)
driver.get(url) driver.get(url)

@ -29,3 +29,11 @@ REDIS_HOST = 'tencentCloud'
REDIS_PORT = '6379' REDIS_PORT = '6379'
REDIS_PASSWORD = 'root' REDIS_PASSWORD = 'root'
REDIS_LISTNAME = "urlList" REDIS_LISTNAME = "urlList"
# 下载器相关配置
USER_AGENT = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
]
Loading…
Cancel
Save