You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
93 lines
3.0 KiB
93 lines
3.0 KiB
# -*- coding: utf-8 -*-
|
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
import settings
|
|
import pipelines
|
|
import downloader
|
|
import redis
|
|
import os
|
|
|
|
# Global settings, all pulled from the project's settings.py
REDIS_HOST = settings.REDIS_HOST
REDIS_PORT = settings.REDIS_PORT
REDIS_PASSWORD = settings.REDIS_PASSWORD
# Name of the Redis list used as the shared url work queue
REDIS_LISTNAME = settings.REDIS_LISTNAME
# Presumably a {category_name: base_url} mapping — iterated with .items() below; confirm in settings.py
BASEURL = settings.BASEURL
# Presumably a {category_name: csv_filename} mapping used by write2csv — confirm in settings.py
FILENAME_CSV = settings.FILENAME_CSV

# Shared connection pool; decode_responses=True makes lpop/llen return str instead of bytes
connection_pool = redis.ConnectionPool(host = REDIS_HOST, port = REDIS_PORT, password = REDIS_PASSWORD, decode_responses = True)
redisconn = redis.Redis(connection_pool = connection_pool)
|
|
|
|
def getCategory(url) -> str:
    """Return the category name whose base url is a substring of *url*.

    Looks the url up against settings.BASEURL (category name -> base url).
    If no base url matches, prints a diagnostic and terminates the process —
    callers never see a return value in that case.
    """
    # Tuple unpacking instead of positional indexing on the (name, base) pairs.
    for name, base in BASEURL.items():
        if base in url:
            return name
    print("can't get a valid baseurl! Check your settings.BASEURL.")
    exit()
|
|
|
|
def geturlList(baseurl, pages = 10) -> list:
    """Build the list of page urls to crawl for one category.

    The original hard-coded 10 pages; *pages* generalizes that with the same
    default, so existing callers are unaffected.

    NOTE(review): the query values are the odd numbers 1, 3, ..., 2*pages-1 —
    the site apparently paginates that way; confirm against the target site.

    :param baseurl: category base url the page query is appended to
    :param pages: number of page urls to generate (default 10)
    :return: list of "<baseurl>&page=<i>" strings
    """
    return [baseurl + "&page=" + str(i) for i in range(1, 2 * pages, 2)]
|
|
|
|
def save2Redis(): # push the urls derived from settings.py into redis
    """Push every page url for every category in settings.BASEURL onto the
    Redis list named by REDIS_LISTNAME, one category at a time."""
    # Tuple unpacking instead of category[0]/category[1] indexing.
    for name, base in BASEURL.items():
        for eachurl in geturlList(base):
            redisconn.rpush(REDIS_LISTNAME, eachurl)
        print("Save the urls for '{}' to Redis queue has done.".format(name))
|
|
|
|
def isNullRedis() -> bool: # True when the pending-url queue in redis is empty
    """Return True if the Redis url queue holds no entries."""
    # Return the comparison directly instead of an if/else returning True/False.
    return redisconn.llen(REDIS_LISTNAME) == 0
|
|
|
|
def precheck() -> bool: # check the state of the redis queue
    """Ensure the Redis queue is populated before crawling starts.

    While the queue is empty, prompts the operator: 'c' seeds the queue from
    settings via save2Redis() and returns True; 'q' exits the process; any
    other input re-prompts. Returns True once the queue is non-empty.
    """
    while redisconn.llen(REDIS_LISTNAME) == 0:
        print("No queue was found!\nPush some urls to the queue using default settings.\nContinue [c] or Exit [q] ?")
        # input() already returns str — the original str() wrapper was redundant.
        check = input()
        if check == 'c':
            save2Redis()
            return True
        elif check == 'q':
            print("Exit.")
            exit()
        else:
            print("invalid input!")
    return True
|
|
|
|
def clearRedis(): # empty the redis queue
    """Remove the whole url queue from Redis.

    A single DEL removes the list key in one round trip, replacing the
    original element-by-element lpop loop (one network round trip per url).
    DEL is a no-op when the key does not exist, matching the old behavior
    on an already-empty queue.
    """
    redisconn.delete(REDIS_LISTNAME)
    print("Redis queue has cleared.")
|
|
|
|
def write2csv(category, response): # write one page's data to the category csv
    """Resolve the csv path for *category* and delegate to pipelines.write2csv.

    Uses os.path.join so the path works on any OS — the original hard-coded
    Windows "\\" separators.

    NOTE(review): FILENAME_CSV.get(category) returns None for an unknown
    category, which would make os.path.join raise — presumably getCategory
    guarantees a known category; confirm.
    """
    filename_csv = os.path.join(os.getcwd(), "Catalogues", FILENAME_CSV.get(category))
    pipelines.write2csv(response, filename_csv)
|
|
|
|
class milkSpider:
    """Fetch a single catalogue page and append it to its category's csv."""

    def __init__(self, url):
        # Resolve the category once up front from settings.BASEURL; the url
        # itself is kept for the later download.
        self.url = url
        self.category = getCategory(url)

    def go(self):
        """Download the page source, then hand it to the csv writer."""
        source = downloader.getsource(self.url)
        self.response = source
        print("write2csv for '{}' was started.".format(self.url))
        write2csv(self.category, source)
|
|
|
|
def mainThread():
    """Drain the Redis url queue, dispatching one milkSpider per url to a
    thread pool of 8 workers. Stops when the queue is empty.

    On any unexpected error, prints the exception and terminates the process.
    """
    try:
        with ThreadPoolExecutor(max_workers = 8) as thread:
            while True:
                # EAFP: lpop returns None when the list is empty, which avoids
                # the original check-then-pop race when several consumers share
                # the queue.
                url = redisconn.lpop(REDIS_LISTNAME)
                if url is None:
                    print("Redis queue is empty, no more threads will be started")
                    break
                aSpider = milkSpider(url)
                thread.submit(aSpider.go)
    # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
    except Exception as e:
        print(e)
        print("sth wrong in mainThread, check your Redis queue, main thread quit.")
        exit()
|
|
|
|
if __name__ == '__main__':
    # NOTE(review): running this module directly only clears the Redis queue;
    # mainThread()/save2Redis() are never invoked here — presumably driven
    # from another entry point, or this is a maintenance action. Confirm.
    clearRedis()