From 8ca463292a0b4546391c2cc2f6d7142a390b2864 Mon Sep 17 00:00:00 2001 From: wkyuu Date: Sun, 17 Apr 2022 14:22:39 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=8E=86=E5=8F=B2=E4=BB=B7?= =?UTF-8?q?=E6=A0=BC=E6=AF=94=E5=AF=B9=E9=A1=B9=E7=9B=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 36 ++++++++++++++++++++++++--- downloader.py | 67 ++++++++------------------------------------------ middlewares.py | 13 +++++----- 3 files changed, 50 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index 47cc18d..02ce24d 100644 --- a/README.md +++ b/README.md @@ -150,15 +150,43 @@ html1 = etree.parse('test.html',etree.HTMLParser(encoding='utf-8')) # 访问 https://httpbin.org/get?show_env=1 可以返回当前浏览器的请求信息 options.add_argument('lang=zh_CN.UTF-8') +# 贴一个用json模块保存cookies +def getCookies(): + with open('cookies.json', 'r', encoding='utf-8') as fd: + listCookies = json.loads(fd.read()) + for cookie in listCookies: + cookies = { + 'domain': cookie['domain'], + 'httpOnly': cookie['httpOnly'], + 'name':cookie['name'], + 'path':'/', + 'secure': cookie['secure'], + 'value':cookie['value'], + } + print(cookies) + +def saveCookies(driver): + jsonCookies = json.dumps(driver.get_cookies()) + with open('cookies.json', 'w', encoding='utf-8') as fd: + fd.write(jsonCookies) + fd.close() + ``` ChromeDriver 下载 [ChromeDriver](https://chromedriver.chromium.org/home) 放到当前目录就行(如果是放在 python 根目录可以不用在实例化 selenium 时指定chromedriver 路径) -### Redis +### Requests -[介绍,配置](C:\Users\wkyuu\Desktop\my\SQL\Redis\Redis - NoSql高速缓存数据库.md) +经典老碟 + +```python +import requests + +``` + +### Redis ```python # 安装 redis 模块 @@ -208,4 +236,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db = 14,[Selenium:添加Cookie的方法](https://cloud.tencent.com/developer/article/1616175) -15, \ No newline at end of file +15, + +16, \ No newline at end of file diff --git a/downloader.py b/downloader.py index 8330ab3..4b6dad8 100644 --- a/downloader.py +++ b/downloader.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -import json from selenium import webdriver from selenium.webdriver.chrome.options import Options from lxml import etree import random import settings -import time +import requests +import json headers = { 'User-Agent': random.choice(settings.USER_AGENT) @@ -34,61 +34,14 @@ def getsource(url): driver.close() return response -def manmanbuy(url): - initChrome = Options() - ''' - initChrome.add_argument('--no-sandbox') - initChrome.add_argument('--headless') - initChrome.add_argument('--disable-gpu') - initChrome.add_argument("disable-cache") - initChrome.add_argument('disable-infobars') - initChrome.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0 - initChrome.add_experimental_option("excludeSwitches",['enable-automation','enable-logging']) - ''' +def useRequests(url): + pass - # driver = webdriver.Chrome(chrome_options = initChrome, executable_path = './chromedriver.exe') - driver = webdriver.Chrome(executable_path = './chromedriver.exe') - # driver.get(url) - # time.sleep(10) - with open('cookies.json', 'r', encoding='utf-8') as fd: - listCookies = json.loads(fd.read()) - for cookie in listCookies: - cookies = { - 'domain': cookie['domain'], - 'httpOnly': cookie['httpOnly'], - 'name':cookie['name'], - 'path':'/', - 'secure': cookie['secure'], - 'value':cookie['value'], - } - print(cookies) - driver.add_cookie(cookies) - driver.get(url) - time.sleep(10) - exit() - driver.implicitly_wait(10) - driver.get(url) - - response = etree.HTML(driver.page_source) - response = etree.tostring(response, encoding = "utf-8", pretty_print = True, method = "html") - response = response.decode('utf-8') - - driver.close() - return response - -def saveCookies(driver): - jsonCookies = json.dumps(driver.get_cookies()) - with open('cookies.json', 'w', encoding='utf-8') as fd: - fd.write(jsonCookies) - fd.close() - - -def buy(): - jdurl = "https://item.jd.com/10047511027349.html" - url = "https://tool.manmanbuy.com/HistoryLowest.aspx?url=" + jdurl - # print(url) - response = manmanbuy(url) - print(response) if __name__ == "__main__": - buy() \ No newline at end of file + jdurl = "https://item.jd.com/10036840192083.html" + url = "https://www.vveby.com/search?keyword=" + jdurl + with open('historyPrice.html', 'w+', encoding = 'utf-8') as fd: + fd.write(getsource(url)) + fd.close() + print('done') \ No newline at end of file diff --git a/middlewares.py b/middlewares.py index 52a5893..c0f568b 100644 --- a/middlewares.py +++ b/middlewares.py @@ -44,7 +44,7 @@ def isNullRedis() -> bool: # 判断redis中待处理的url为空 if redisconn.llen(REDIS_LISTNAME) == 0: return True else: return False -def precheck() -> bool: # +def precheck() -> bool: # 检查redis队列情况 while redisconn.llen(REDIS_LISTNAME) == 0: print("No queue was found!\nPush some urls to the queue using default settings.\nContinue [c] or Exit [q] ?") check = str(input()) @@ -57,6 +57,11 @@ def precheck() -> bool: # else: print("invalid input!") return True +def clearRedis(): # 用于清空Redis队列 + while not isNullRedis(): + redisconn.lpop(REDIS_LISTNAME) + print("Redis queue has cleared.") + def write2csv(category, response): # 写入csv文件 filename_csv = os.getcwd() + "\\Catalogues\\" + FILENAME_CSV.get(category) pipelines.write2csv(response, filename_csv) @@ -106,7 +111,7 @@ def mainThread(threadlines = 16, flag = flag): # 线程数默认为3 exit() if __name__ == '__main__': - pass + clearRedis() @@ -133,7 +138,3 @@ def localtest(category): # 本地加载的源码测试 print("page " + str(page) + " sleep over at " + time.ctime()) page += 1 -def clearRedis(): # 用于清空Redis队列 - while not isNullRedis(): - redisconn.lpop(REDIS_LISTNAME) - print("Redis queue has cleared.") \ No newline at end of file