You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

93 lines
3.0 KiB

# -*- coding: utf-8 -*-
from concurrent.futures import ThreadPoolExecutor
import settings
import pipelines
import downloader
import redis
import os
# Global settings: values are read once from settings.py into module constants.
REDIS_HOST = settings.REDIS_HOST
REDIS_PORT = settings.REDIS_PORT
REDIS_PASSWORD = settings.REDIS_PASSWORD
# Name of the Redis list used as the queue of URLs still to be crawled.
REDIS_LISTNAME = settings.REDIS_LISTNAME
# Mapping iterated with .items() as (category, base url) pairs in this module.
BASEURL = settings.BASEURL
# Mapping looked up by category to obtain the CSV file name (see write2csv).
FILENAME_CSV = settings.FILENAME_CSV
# One shared pool; decode_responses = True makes redis-py return str, not bytes.
connection_pool = redis.ConnectionPool(host = REDIS_HOST, port = REDIS_PORT, password = REDIS_PASSWORD, decode_responses = True)
# Every Redis call in this module goes through this client.
redisconn = redis.Redis(connection_pool = connection_pool)
def getCategory(url) -> str:
    """Return the category whose configured base URL occurs inside *url*.

    BASEURL is scanned in iteration order and the first entry whose base URL
    is a substring of *url* wins.

    Args:
        url: The URL to classify.

    Returns:
        The matching key of BASEURL.

    Raises:
        SystemExit: With status 1 when no base URL in BASEURL matches.
    """
    for category, baseurl in BASEURL.items():
        if baseurl in url:
            return category
    print("can't get a valid baseurl! Check your settings.BASEURL.")
    # exit() is an interactive helper injected by the site module and reports
    # status 0; raise SystemExit directly with a failing status instead.
    raise SystemExit(1)
def geturlList(baseurl, pages=10) -> list:
    """Build the list of paginated URLs for one base URL.

    The page query value advances in steps of two (1, 3, 5, ...), exactly as
    the original hard-coded range did.
    NOTE(review): presumably the target site numbers its result pages with
    odd values only -- confirm against the site.

    Args:
        baseurl: URL to which "&page=<n>" is appended.
        pages: How many URLs to generate. Defaults to 10, the count that was
            previously hard-coded.

    Returns:
        A list of *pages* URLs; empty when *pages* is 0 or negative.
    """
    return [baseurl + "&page=" + str(page) for page in range(1, 2 * pages, 2)]
def save2Redis():
    """Push the paginated URLs of every category in BASEURL onto the Redis queue.

    For each (category, base url) pair the URLs produced by geturlList are
    appended to the tail of the REDIS_LISTNAME list, then a confirmation line
    is printed for that category.
    """
    for name, base in BASEURL.items():
        page_urls = geturlList(base)
        for page_url in page_urls:
            redisconn.rpush(REDIS_LISTNAME, page_url)
        print("Save the urls for '{}' to Redis queue has done.".format(name))
def isNullRedis() -> bool:
    """Return True when the Redis URL queue has no pending entries."""
    pending = redisconn.llen(REDIS_LISTNAME)
    return pending == 0
def precheck() -> bool:
    """Make sure the Redis queue has URLs before crawling starts.

    While the queue is empty the user is asked whether to fill it from the
    default settings ('c') or to quit ('q'). Any other answer repeats the
    prompt. The answer is compared after stripping surrounding whitespace and
    lower-casing it, so " C " is accepted as well.

    Returns:
        True once the queue was non-empty on entry or has been filled via
        save2Redis.

    Raises:
        SystemExit: With status 0 when the user chooses to quit.
    """
    while isNullRedis():
        print("No queue was found!\nPush some urls to the queue using default settings.\nContinue [c] or Exit [q] ?")
        answer = input().strip().lower()
        if answer == 'c':
            save2Redis()
            return True
        if answer == 'q':
            print("Exit.")
            # exit() is an interactive helper from the site module and is not
            # meant for programs; raise SystemExit directly.
            raise SystemExit(0)
        print("invalid input!")
    return True
def clearRedis():
    """Empty the Redis URL queue and print a confirmation."""
    # DEL drops the whole list in one atomic command. Popping entries one at a
    # time cost two round trips per element (LLEN + LPOP) and could interleave
    # with other clients. Redis removes a list key once it is empty, so the
    # end state is the same as after popping everything.
    redisconn.delete(REDIS_LISTNAME)
    print("Redis queue has cleared.")
def write2csv(category, response):
    """Hand *response* to pipelines.write2csv with the CSV path for *category*.

    The target is "<current working directory>/Catalogues/<file name>", where
    the file name is looked up in FILENAME_CSV.

    Args:
        category: Key into FILENAME_CSV.
        response: Passed through unchanged to pipelines.write2csv.

    Raises:
        TypeError: When *category* has no entry in FILENAME_CSV (the lookup
            yields None, which cannot be joined into a path).
    """
    # os.path.join uses the platform separator; the previous hard-coded
    # backslashes only formed a directory path on Windows.
    filename_csv = os.path.join(os.getcwd(), "Catalogues", FILENAME_CSV.get(category))
    pipelines.write2csv(response, filename_csv)
class milkSpider:
    """One crawl job: download a single URL and store the result as CSV."""

    def __init__(self, url):
        """Remember *url* and resolve its category via getCategory."""
        self.url = url
        self.category = getCategory(url)

    def go(self):
        """Download self.url, keep the result on self.response and write it out."""
        target = self.url
        # Whatever downloader.getsource returns is forwarded untouched.
        self.response = downloader.getsource(target)
        print("write2csv for '{}' was started.".format(target))
        write2csv(self.category, self.response)
def mainThread():
    """Drain the Redis URL queue, running one milkSpider job per URL.

    URLs are popped from the head of REDIS_LISTNAME and submitted to a pool of
    eight worker threads until the queue is empty. Leaving the pool's `with`
    block waits for all submitted jobs; afterwards every job that raised is
    reported on stdout.

    Raises:
        SystemExit: With status 1 when anything fails while dispatching jobs.
    """
    submitted = []
    try:
        with ThreadPoolExecutor(max_workers = 8) as pool:
            while True:
                # Use the LPOP result itself as the emptiness test. Checking
                # the length first and popping afterwards is a race: another
                # consumer can empty the queue in between, handing None to
                # milkSpider.
                url = redisconn.lpop(REDIS_LISTNAME)
                if url is None:
                    print("Redis queue is empty, no more threads will be started")
                    break
                aSpider = milkSpider(url)
                submitted.append((url, pool.submit(aSpider.go)))
    # Deliberately broad, as before: Ctrl-C and the SystemExit raised by
    # getCategory take the same "report and quit" path.
    except BaseException as e:
        print(e)
        print("sth wrong in mainThread, check your Redis queue, main thread quit.")
        # exit() is the site module's interactive helper and reported status 0
        # on failure; signal the error to the caller's shell instead.
        raise SystemExit(1)
    # All futures are finished here. Without this check an exception raised
    # inside a worker would be stored on its future and never surface.
    for url, future in submitted:
        error = future.exception()
        if error is not None:
            print("task for '{}' failed: {!r}".format(url, error))
if __name__ == '__main__':
    # Running this module as a script empties the Redis URL queue.
    clearRedis()