# -*- coding: utf-8 -*-
"""Redis-backed crawl driver.

Seeds a Redis list with paginated URLs built from ``settings.BASEURL``,
then pops those URLs and, on a thread pool, downloads each one and writes
the result to a per-category CSV file.
"""
from concurrent.futures import ThreadPoolExecutor
import os
import sys

import redis

import settings
import pipelines
import downloader

# Global settings, copied from settings.py
REDIS_HOST = settings.REDIS_HOST
REDIS_PORT = settings.REDIS_PORT
REDIS_PASSWORD = settings.REDIS_PASSWORD
REDIS_LISTNAME = settings.REDIS_LISTNAME
BASEURL = settings.BASEURL
FILENAME_CSV = settings.FILENAME_CSV

# decode_responses=True makes redis-py return str instead of bytes, so the
# popped URLs can be used directly in substring tests and string formatting.
connection_pool = redis.ConnectionPool(
    host=REDIS_HOST,
    port=REDIS_PORT,
    password=REDIS_PASSWORD,
    decode_responses=True,
)
redisconn = redis.Redis(connection_pool=connection_pool)


def getCategory(url) -> str:
    """Return the BASEURL key whose base URL is a substring of *url*.

    The first matching entry (in BASEURL iteration order) wins. If no entry
    matches, an error message is printed and the process exits via
    SystemExit.
    """
    for category, base in BASEURL.items():
        if base in url:
            return category
    print("can't get a valid baseurl! Check your settings.BASEURL.")
    # sys.exit() rather than the site-provided exit(), which is meant for the
    # interactive shell and is missing under `python -S` / frozen builds.
    sys.exit()


def geturlList(baseurl) -> list:
    """Build the list of page URLs for one base URL.

    Appends ``&page=<i>`` for i = 1, 3, ..., 19, i.e. 10 URLs.
    NOTE(review): presumably the site advances its page parameter by 2 per
    visible page, which is why only odd values are used -- confirm.
    """
    return [baseurl + r"&page=" + str(page) for page in range(1, 20, 2)]


def save2Redis():
    """Push the page URLs of every settings.BASEURL entry onto the Redis list."""
    for category, base in BASEURL.items():
        # RPUSH accepts several values; one round trip per category instead
        # of one per URL. geturlList always returns a non-empty list.
        redisconn.rpush(REDIS_LISTNAME, *geturlList(base))
        print("Save the urls for '{}' to Redis queue has done.".format(category))


def isNullRedis() -> bool:
    """Return True when the Redis list holds no pending URLs."""
    return redisconn.llen(REDIS_LISTNAME) == 0


def precheck() -> bool:
    """Make sure the Redis queue has work, prompting the user if it is empty.

    While the queue is empty the user is asked to either seed it with the
    default settings ('c') or quit ('q', which exits the process). Any other
    input repeats the prompt. Returns True once the queue is usable.
    """
    while isNullRedis():
        print("No queue was found!\nPush some urls to the queue using default settings.\nContinue [c] or Exit [q] ?")
        check = input()
        if check == 'c':
            save2Redis()
            return True
        elif check == 'q':
            print("Exit.")
            sys.exit()
        else:
            print("invalid input!")
    return True


def clearRedis():
    """Empty the Redis queue."""
    # DEL drops the whole list in a single command instead of popping the
    # elements one at a time.
    redisconn.delete(REDIS_LISTNAME)
    print("Redis queue has cleared.")


def write2csv(category, response):
    """Write *response* to the CSV file configured for *category*.

    The target is ``<cwd>/Catalogues/<FILENAME_CSV[category]>``; the actual
    writing is delegated to ``pipelines.write2csv``.
    """
    # os.path.join instead of hard-coded backslashes so the path is also
    # valid on non-Windows systems.
    filename_csv = os.path.join(os.getcwd(), "Catalogues", FILENAME_CSV.get(category))
    pipelines.write2csv(response, filename_csv)


class milkSpider:
    """One crawl task: download a single URL and store it in its category CSV."""

    def __init__(self, url):
        self.url = url
        # Resolved eagerly; getCategory exits the process for unknown URLs.
        self.category = getCategory(url)

    def go(self):
        """Download self.url and pass the result to write2csv."""
        self.response = downloader.getsource(self.url)
        print("write2csv for '{}' was started.".format(self.url))
        write2csv(self.category, self.response)


def _report_failure(future):
    """Print the exception raised by a finished spider task, if any."""
    if future.cancelled():
        return
    error = future.exception()
    if error is not None:
        print("spider task failed: {!r}".format(error))


def mainThread():
    """Drain the Redis queue, running one milkSpider per URL on a thread pool.

    Stops submitting once the queue is empty; leaving the ``with`` block then
    waits for the already-submitted tasks. Any exception raised while
    dispatching is printed and the process exits.
    """
    try:
        with ThreadPoolExecutor(max_workers=8) as thread:
            while True:
                # Pop first and test the result: a separate emptiness check
                # followed by LPOP can race with another consumer and hand
                # None to milkSpider.
                url = redisconn.lpop(REDIS_LISTNAME)
                if url is None:
                    print("Redis queue is empty, no more threads will be started")
                    break
                aSpider = milkSpider(url)
                # Without a callback, exceptions raised inside go() would be
                # stored on the discarded future and never surface.
                thread.submit(aSpider.go).add_done_callback(_report_failure)
    except BaseException as e:
        # Deliberately broad: also reports SystemExit from getCategory and
        # KeyboardInterrupt before quitting.
        print(e)
        print("sth wrong in mainThread, check your Redis queue, main thread quit.")
        sys.exit()


if __name__ == '__main__':
    clearRedis()