|
|
|
@ -1,9 +1,9 @@
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
import settings
|
|
|
|
|
import pipelines
|
|
|
|
|
import downloader
|
|
|
|
|
import threading
|
|
|
|
|
import redis
|
|
|
|
|
import time
|
|
|
|
|
import os
|
|
|
|
@ -16,9 +16,6 @@ REDIS_LISTNAME = settings.REDIS_LISTNAME
|
|
|
|
|
BASEURL = settings.BASEURL
FILENAME_CSV = settings.FILENAME_CSV

# Lock the spider workers hold while writing a response to CSV.
threadLock = threading.Lock()
threadlines = 16  # use 16 threads by default; do not go above 20
flag = 1  # marker consulted by the main thread

# A single connection pool backs the module-wide Redis client.
connection_pool = redis.ConnectionPool(
    host=REDIS_HOST,
    port=REDIS_PORT,
    password=REDIS_PASSWORD,
    decode_responses=True,
)
redisconn = redis.Redis(connection_pool=connection_pool)
|
|
|
|
|
|
|
|
|
@ -67,42 +64,26 @@ def write2csv(category, response): # 写入csv文件
|
|
|
|
|
filename_csv = os.getcwd() + "\\Catalogues\\" + FILENAME_CSV.get(category)
|
|
|
|
|
pipelines.write2csv(response, filename_csv)
|
|
|
|
|
|
|
|
|
|
class milkSpider:
    """Download one URL and write the result to its category's CSV file.

    Instances are plain task objects: build one per URL and hand its
    ``go`` method to a thread pool (or call it directly).
    """

    def __init__(self, url):
        """Store the target URL and derive its category.

        Args:
            url: Address of the page to fetch; it is passed to
                ``getCategory`` here and to ``downloader.getsource`` in
                ``go``.
        """
        self.url = url
        self.category = getCategory(url)
        self.response = None  # populated by go()

    def go(self):
        """Fetch ``self.url`` and write the response through ``write2csv``.

        The download runs outside the lock so that fetches from several
        workers can overlap; only the CSV write is serialised.
        """
        self.response = downloader.getsource(self.url)
        # ``with`` releases the lock even when write2csv raises, so one
        # failed write cannot block every other worker forever.
        with threadLock:
            print("write2csv for '{}' was started.".format(self.url))
            write2csv(self.category, self.response)
            # The url identifies the finished task; this class has no name.
            print("{} is done.".format(self.url))
|
|
|
|
|
|
|
|
|
|
def mainThread(max_workers = 8):
    """Drain the Redis URL list, crawling each URL on a thread pool.

    URLs are popped from ``REDIS_LISTNAME`` one at a time and each is
    wrapped in a ``milkSpider`` whose ``go`` method is submitted to the
    pool. The function returns once the list is empty and every submitted
    task has finished.

    Args:
        max_workers: Size of the thread pool (defaults to 8).
    """
    empty_message = "Redis queue is empty, no more threads will be started"
    futures = []
    try:
        with ThreadPoolExecutor(max_workers = max_workers) as pool:
            while True:
                if isNullRedis():
                    print(empty_message)
                    break
                url = redisconn.lpop(REDIS_LISTNAME)
                if url is None:
                    # The list was emptied between the check and the pop
                    # (e.g. by another consumer); there is nothing to crawl.
                    print(empty_message)
                    break
                spider = milkSpider(url)
                futures.append(pool.submit(spider.go))
    except Exception as e:
        # Best-effort crawl: report the failure instead of crashing.
        # KeyboardInterrupt / SystemExit are deliberately not caught here.
        print(e)

    # Leaving the ``with`` block waits for all workers, so every future is
    # finished by now. Report task errors that would otherwise be lost.
    for future in futures:
        error = future.exception()
        if error is not None:
            print(error)
|
|
|
|
|