You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
93 lines
3.0 KiB
93 lines
3.0 KiB
# -*- coding: utf-8 -*-
|
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
import settings
|
|
import pipelines
|
|
import downloader
|
|
import redis
|
|
import os
|
|
|
|
# Global settings, all pulled from the project's settings.py
REDIS_HOST = settings.REDIS_HOST
REDIS_PORT = settings.REDIS_PORT
REDIS_PASSWORD = settings.REDIS_PASSWORD
# Name of the Redis list used as the shared url work queue
REDIS_LISTNAME = settings.REDIS_LISTNAME
# Presumably a {category_name: base_url} mapping — iterated with .items() below; confirm in settings.py
BASEURL = settings.BASEURL
# Presumably a {category_name: csv_filename} mapping used by write2csv — confirm in settings.py
FILENAME_CSV = settings.FILENAME_CSV

# Shared connection pool; decode_responses=True makes lpop/llen return str instead of bytes
connection_pool = redis.ConnectionPool(host = REDIS_HOST, port = REDIS_PORT, password = REDIS_PASSWORD, decode_responses = True)
redisconn = redis.Redis(connection_pool = connection_pool)
|
|
|
|
def getCategory(url) -> str:
    """Return the category name whose base url is a substring of *url*.

    Looks the url up against settings.BASEURL (category name -> base url).
    If no base url matches, prints a diagnostic and terminates the process —
    callers never see a return value in that case.
    """
    # Tuple unpacking instead of positional indexing on the (name, base) pairs.
    for name, base in BASEURL.items():
        if base in url:
            return name
    print("can't get a valid baseurl! Check your settings.BASEURL.")
    exit()
|
|
|
|
def geturlList(baseurl, pages = 10) -> list:
    """Build the list of page urls to crawl for one category.

    The original hard-coded 10 pages; *pages* generalizes that with the same
    default, so existing callers are unaffected.

    NOTE(review): the query values are the odd numbers 1, 3, ..., 2*pages-1 —
    the site apparently paginates that way; confirm against the target site.

    :param baseurl: category base url the page query is appended to
    :param pages: number of page urls to generate (default 10)
    :return: list of "<baseurl>&page=<i>" strings
    """
    return [baseurl + "&page=" + str(i) for i in range(1, 2 * pages, 2)]
|
|
|
|
def save2Redis(): # push the urls derived from settings.py into redis
    """Push every page url for every category in settings.BASEURL onto the
    Redis list named by REDIS_LISTNAME, one category at a time."""
    # Tuple unpacking instead of category[0]/category[1] indexing.
    for name, base in BASEURL.items():
        for eachurl in geturlList(base):
            redisconn.rpush(REDIS_LISTNAME, eachurl)
        print("Save the urls for '{}' to Redis queue has done.".format(name))
|
|
|
|
def isNullRedis() -> bool: # True when the pending-url queue in redis is empty
    """Return True if the Redis url queue holds no entries."""
    # Return the comparison directly instead of an if/else returning True/False.
    return redisconn.llen(REDIS_LISTNAME) == 0
|
|
|
|
def precheck() -> bool: # check the state of the redis queue
    """Ensure the Redis queue is populated before crawling starts.

    While the queue is empty, prompts the operator: 'c' seeds the queue from
    settings via save2Redis() and returns True; 'q' exits the process; any
    other input re-prompts. Returns True once the queue is non-empty.
    """
    while redisconn.llen(REDIS_LISTNAME) == 0:
        print("No queue was found!\nPush some urls to the queue using default settings.\nContinue [c] or Exit [q] ?")
        # input() already returns str — the original str() wrapper was redundant.
        check = input()
        if check == 'c':
            save2Redis()
            return True
        elif check == 'q':
            print("Exit.")
            exit()
        else:
            print("invalid input!")
    return True
|
|
|
|
def clearRedis(): # empty the redis queue
    """Remove the whole url queue from Redis.

    A single DEL removes the list key in one round trip, replacing the
    original element-by-element lpop loop (one network round trip per url).
    DEL is a no-op when the key does not exist, matching the old behavior
    on an already-empty queue.
    """
    redisconn.delete(REDIS_LISTNAME)
    print("Redis queue has cleared.")
|
|
|
|
def write2csv(category, response): # write one page's data to the category csv
    """Resolve the csv path for *category* and delegate to pipelines.write2csv.

    Uses os.path.join so the path works on any OS — the original hard-coded
    Windows "\\" separators.

    NOTE(review): FILENAME_CSV.get(category) returns None for an unknown
    category, which would make os.path.join raise — presumably getCategory
    guarantees a known category; confirm.
    """
    filename_csv = os.path.join(os.getcwd(), "Catalogues", FILENAME_CSV.get(category))
    pipelines.write2csv(response, filename_csv)
|
|
|
|
class milkSpider:
    """Fetch a single catalogue page and append it to its category's csv."""

    def __init__(self, url):
        # Resolve the category once up front from settings.BASEURL; the url
        # itself is kept for the later download.
        self.url = url
        self.category = getCategory(url)

    def go(self):
        """Download the page source, then hand it to the csv writer."""
        source = downloader.getsource(self.url)
        self.response = source
        print("write2csv for '{}' was started.".format(self.url))
        write2csv(self.category, source)
|
|
|
|
def mainThread():
    """Drain the Redis url queue, dispatching one milkSpider per url to a
    thread pool of 8 workers. Stops when the queue is empty.

    On any unexpected error, prints the exception and terminates the process.
    """
    try:
        with ThreadPoolExecutor(max_workers = 8) as thread:
            while True:
                # EAFP: lpop returns None when the list is empty, which avoids
                # the original check-then-pop race when several consumers share
                # the queue.
                url = redisconn.lpop(REDIS_LISTNAME)
                if url is None:
                    print("Redis queue is empty, no more threads will be started")
                    break
                aSpider = milkSpider(url)
                thread.submit(aSpider.go)
    # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
    except Exception as e:
        print(e)
        print("sth wrong in mainThread, check your Redis queue, main thread quit.")
        exit()
|
|
|
|
if __name__ == '__main__':
    # NOTE(review): running this module directly only clears the Redis queue;
    # mainThread()/save2Redis() are never invoked here — presumably driven
    # from another entry point, or this is a maintenance action. Confirm.
    clearRedis()