|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
import requests
|
|
|
|
from lxml import etree
|
|
|
|
from selenium import webdriver
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
|
|
|
import settings
|
|
|
|
|
|
|
|
# Default HTTP headers sent by the requests-based downloader.
# The User-Agent is picked at random from settings.USER_AGENT once, when this
# module is imported; use settings.USER_AGENT[1] instead to pin a fixed agent.
headers = {'User-Agent': random.choice(settings.USER_AGENT)}
|
|
|
|
|
|
|
|
def getsource(url):
    """Load ``url`` in headless Chrome and return the page HTML as text.

    The page is opened through Selenium, the browser's page source is parsed
    with lxml and then re-serialized as pretty-printed HTML.

    Args:
        url: Address of the page to load.

    Returns:
        The serialized HTML document as a ``str``.

    Raises:
        Any exception raised by Selenium or lxml is propagated unchanged; the
        browser session is always shut down before it reaches the caller.
    """
    initChrome = Options()

    initChrome.add_argument('--no-sandbox')
    initChrome.add_argument('--headless')
    initChrome.add_argument('--disable-gpu')
    initChrome.add_argument("disable-cache")
    initChrome.add_argument('disable-infobars')
    # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0
    initChrome.add_argument('log-level=3')
    initChrome.add_experimental_option(
        "excludeSwitches", ['enable-automation', 'enable-logging']
    )

    # NOTE(review): `chrome_options` and `executable_path` are deprecated in
    # newer Selenium releases; kept as-is so the call matches the Selenium
    # version this project was written against -- confirm before upgrading.
    driver = webdriver.Chrome(chrome_options=initChrome, executable_path='./chromedriver.exe')
    try:
        driver.implicitly_wait(10)
        driver.get(url)
        # Copy the source out while the session is still alive.
        page_source = driver.page_source
    finally:
        # quit() ends the whole WebDriver session, whereas close() only closes
        # the current window. Running it in `finally` prevents a leaked
        # browser when loading the page fails.
        driver.quit()

    tree = etree.HTML(page_source)
    return etree.tostring(
        tree, encoding="utf-8", pretty_print=True, method="html"
    ).decode('utf-8')
|
|
|
|
|
|
|
|
def useRequests(url, timeout=10):
    """Download ``url`` with requests and return it as pretty-printed HTML.

    The response is decoded using the encoding detected from its content
    (``apparent_encoding``), parsed with lxml and re-serialized.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests``. Pass ``None`` to wait indefinitely.

    Returns:
        The serialized HTML document as a ``str``.

    Raises:
        SystemExit: If the download or the parsing fails. The error is
            printed first and the process exits with status 1.
    """
    try:
        # The context manager releases the session's pooled connections.
        with requests.Session() as session:
            res = session.get(url, headers=headers, timeout=timeout)
            res.encoding = res.apparent_encoding
            html_text = res.text

        tree = etree.HTML(html_text)
        return etree.tostring(
            tree, encoding='utf-8', pretty_print=True, method='html'
        ).decode('utf-8')
    except Exception as e:
        # Catch Exception rather than BaseException so that Ctrl-C and
        # interpreter shutdown are not reported as downloader failures.
        print(e)
        print("sth wrong in your downloader.useRequests. Exiting...")
        # Raise SystemExit directly: the `exit()` helper is installed by the
        # `site` module and is not guaranteed to exist; a non-zero status
        # also lets the calling process detect the failure.
        raise SystemExit(1)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: look up one JD product page through the vveby search
    # endpoint and fetch the result page with the requests-based downloader.
    search_prefix = "https://www.vveby.com/search?keyword="
    jdurl = "https://item.jd.com/10023043997421.html"
    url = "".join((search_prefix, jdurl))

    print(url)
    useRequests(url)
    print('done')
|