# milkSpider/middlewares.py

# -*- coding: utf-8 -*-

import settings
import pipelines
import downloader
import threading
import redis
import time
import os

# Global settings, copied from the project's settings module
REDIS_HOST = settings.REDIS_HOST
REDIS_PORT = settings.REDIS_PORT
REDIS_PASSWORD = settings.REDIS_PASSWORD
REDIS_LISTNAME = settings.REDIS_LISTNAME
BASEURL = settings.BASEURL
FILENAME_CSV = settings.FILENAME_CSV

# Lock held by each milkSpider thread while it writes to the CSV file
threadLock = threading.Lock()
threadlines = 16    # default number of threads per batch (16); do not exceed 20
flag = 1    # default for mainThread's flag: 1 = keep starting batches
# One shared connection pool; decode_responses makes Redis replies str instead of bytes
connection_pool = redis.ConnectionPool(host = REDIS_HOST, port = REDIS_PORT, password = REDIS_PASSWORD, decode_responses = True)
redisconn = redis.Redis(connection_pool = connection_pool)

def getCategory(url) -> str:
    """Return the category name whose base URL is contained in *url*.

    ``BASEURL`` is scanned in its iteration order and the first entry whose
    value is a substring of *url* wins.  When nothing matches, a hint is
    printed and ``exit()`` is called.
    """
    for name, base in BASEURL.items():
        if base not in url:
            continue
        return name
    print("can't get a valid baseurl! Check your settings.BASEURL.")
    exit()

def geturlList(baseurl, pages = 10) -> list:
    """Build the paginated URLs for one base URL.

    Args:
        baseurl: URL to which ``&page=<n>`` is appended.
        pages: how many URLs to generate (default 10, the previously
            hard-coded amount).  Zero or a negative value yields ``[]``.

    Returns:
        A list of ``pages`` URLs whose page values are the odd numbers
        1, 3, 5, ... (19 being the last one for the default of 10).
    """
    # The step of 2 is kept from the original loop; presumably the target
    # site numbers its result pages with odd values -- TODO confirm.
    return [baseurl + "&page=" + str(number) for number in range(1, 2 * pages, 2)]

def save2Redis():
    """Push the paginated URLs of every ``BASEURL`` entry onto the Redis queue.

    For each category the URLs produced by ``geturlList`` are appended to
    the tail of the ``REDIS_LISTNAME`` list, then a confirmation line is
    printed.
    """
    for name, base in BASEURL.items():
        page_urls = geturlList(base)
        for page_url in page_urls:
            redisconn.rpush(REDIS_LISTNAME, page_url)
        print("Save the urls for '{}' to Redis queue has done.".format(name))

def isNullRedis() -> bool:
    """Tell whether the Redis list ``REDIS_LISTNAME`` holds no pending URLs."""
    pending = redisconn.llen(REDIS_LISTNAME)
    return pending == 0

def precheck() -> bool:
    """Make sure the Redis queue has URLs before crawling starts.

    While the queue is empty the user is asked on stdin what to do:
    ``c`` fills the queue through ``save2Redis``, ``q`` quits the program,
    and anything else repeats the question.  Always returns ``True`` when
    it returns at all.
    """
    prompt = "No queue was found!\nPush some urls to the queue using default settings.\nContinue [c] or Exit [q] ?"
    while isNullRedis():
        print(prompt)
        answer = input()
        if answer == 'q':
            print("Exit.")
            exit()
        elif answer == 'c':
            save2Redis()
            break
        else:
            print("invalid input!")
    return True

def clearRedis():
    """Empty the Redis URL queue.

    The whole list key is removed with one ``DEL`` command instead of
    popping the elements one by one.  That is a single atomic round trip,
    and it cannot loop forever when another client keeps pushing URLs.
    Deleting a key that does not exist is not an error.
    """
    redisconn.delete(REDIS_LISTNAME)
    print("Redis queue has cleared.")

def write2csv(category, response):
    """Write *response* to the CSV file configured for *category*.

    The target is ``<cwd>/Catalogues/<FILENAME_CSV[category]>``; the actual
    writing is delegated to ``pipelines.write2csv``.

    Raises:
        KeyError: if ``FILENAME_CSV`` has no entry for *category*.
    """
    csv_name = FILENAME_CSV.get(category)
    if csv_name is None:
        # Fail with a clear message instead of a "str + None" TypeError.
        raise KeyError("no CSV filename configured for category {!r}".format(category))
    # os.path.join uses the platform separator; the hard-coded backslashes
    # only produced a valid path on Windows.
    filename_csv = os.path.join(os.getcwd(), "Catalogues", csv_name)
    pipelines.write2csv(response, filename_csv)

class milkSpider(threading.Thread):
    """Worker thread that fetches one URL and writes the result to CSV.

    The category is resolved from the URL when the thread object is
    created (in the creating thread).  ``run`` fetches the page through
    ``downloader.getsource`` and hands it to ``write2csv`` while holding
    the module-level ``threadLock``.
    """

    def __init__(self, name, url):
        super().__init__()
        self.name = name
        self.url = url
        self.category = getCategory(url)
        self.response = None    # filled in by run()

    def run(self):
        """Fetch ``self.url`` and write it to CSV under ``threadLock``."""
        self.response = downloader.getsource(self.url)
        # "with" releases the lock even if write2csv raises; a bare
        # acquire()/release() pair would leave it held and block every
        # other worker forever.
        with threadLock:
            print("write2csv for '{}' was started.".format(self.url))
            write2csv(self.category, self.response)
            print("{} is done.".format(self.name))

def mainThread(threadlines = threadlines, flag = flag):
    """Drain the Redis URL queue with batches of worker threads.

    Each batch pops up to ``threadlines`` URLs, starts one ``milkSpider``
    per URL and waits for all of them to finish before the next batch.

    Args:
        threadlines: maximum number of threads per batch; defaults to the
            module-level ``threadlines`` (16).
        flag: 1 keeps starting batches until the queue is empty; any other
            value runs a single batch only.

    Anything raised in this (the calling) thread is printed and the
    program is ended with ``exit()``.
    """
    try:
        # A loop replaces the former self-recursion, which added one stack
        # frame and one nested exception handler per batch.
        while True:
            threads = []
            for index in range(1, threadlines + 1):
                # Pop first and test the result: checking the length and
                # popping afterwards can yield None when another consumer
                # empties the queue in between.
                url = redisconn.lpop(REDIS_LISTNAME)
                if url is None:
                    print("Redis queue is empty, no more threads will be started")
                    flag = 0
                    break
                name = "Thread[" + str(index) + "]"
                print("{} started... {}/{}".format(name, str(index), threadlines))
                athread = milkSpider(name, url)
                athread.start()
                threads.append(athread)

            for thread in threads:
                thread.join()

            if flag != 1:
                break

    # BaseException is kept from the original on purpose, so Ctrl-C and a
    # SystemExit raised by a helper are still reported the same way.
    except BaseException as e:
        print(e)
        print("sth wrong in mainThread, check your Redis queue, main thread quit.")
        exit()

# Running this module as a script only empties the Redis queue.
if __name__ == '__main__':
    clearRedis()


# Helpers for local testing below
def print2console(response):    # print to the command line
    """Forward *response* to ``pipelines.print2console``."""
    pipelines.print2console(response)

def localtest(category):
    """Run the CSV pipeline on locally stored page sources of *category*.

    The files listed by ``settings.getfileList`` for
    ``settings.FILEPATH[category]`` are loaded one by one through
    ``pipelines.gethtml`` in ``"cache"`` mode and written to the
    category's CSV file, with a 10 second pause after each file.
    """
    fileList = settings.getfileList(settings.FILEPATH.get(category))
    for page, filename in enumerate(fileList, start=1):
        print("↓↓↓↓↓↓↓↓↓↓\npage " + str(page) + " start at " + time.ctime())
        print("正在爬取第 " + str(page) + " 页: " + filename)

        # The mode only has to be set here, once.
        response = pipelines.gethtml(filename, gethtml_mode = "cache")
        # write2csv takes (category, response); the former one-argument
        # call raised TypeError on the first file.
        write2csv(category, response)

        print("page " + str(page) + " sleep at " + time.ctime())
        time.sleep(10)
        print("page " + str(page) + " sleep over at " + time.ctime())