diff --git a/README.md b/README.md index 220486e..3bd99f1 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,10 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db = 在没有使用线程之前,完整跑完五个种类共(30 x 10 x 5 = 1500)条数据,用时365s +使用线程数为5的情况下,完整跑完五个种类共 1500条数据,用时130s + +使用线程数为16的情况下,完整跑完五个种类共 1500条数据,用时80s + ## 参考链接 1,[selenium+python自动化100-centos上搭建selenium启动chrome浏览器headless无界面模式](https://www.cnblogs.com/yoyoketang/p/11582012.html) @@ -179,7 +183,7 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db = 4,[Python将list逐行读入到csv文件中](https://blog.csdn.net/weixin_41068770/article/details/103145660) -5,[详解pandas的read_csv方法](https://www.cnblogs.com/traditional/p/12514914.html) +5, 6,[python 3 实现定义跨模块的全局变量和使用](https://codeantenna.com/a/9YbdOKrrSJ) @@ -189,4 +193,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db = 9,[python + redis 实现 分布式队列任务](https://cloud.tencent.com/developer/article/1697383) -10, \ No newline at end of file +10,[深入理解Python线程中join()函数](https://www.linuxidc.com/Linux/2019-03/157795.htm) + +11, \ No newline at end of file diff --git a/middlewares.py b/middlewares.py index 9884486..7e7687b 100644 --- a/middlewares.py +++ b/middlewares.py @@ -65,20 +65,21 @@ class milkSpider(threading.Thread): def __init__(self, name, url): threading.Thread.__init__(self) self.name = name - self.url = url1 + self.url = url self.category = getCategory(url) - self.response = downloader.getsource(self.url) # self.response = "" def run(self): + self.response = downloader.getsource(self.url) threadLock.acquire() - print("write2csv for '{}' will be started in 3 seconds....".format(self.url)) - time.sleep(3) + # print("write2csv for '{}' will be started in 3 seconds....".format(self.url)) + print("write2csv for '{}' was started.".format(self.url)) + # time.sleep(3) write2csv(self.category, self.response) print("{} is done.".format(self.name)) threadLock.release() -def mainThread(threadlines = 5, flag = 
flag): # 线程数默认为3 +def mainThread(threadlines = 16, flag = flag): # 线程数默认为16 try: threads = [] for index in range(1, threadlines + 1): @@ -106,7 +107,9 @@ def mainThread(threadlines = 5, flag = flag): # 线程数默认为3 if __name__ == '__main__': if precheck(): + start_time = time.time() mainThread() + print("Totally spend " + str(time.time() - start_time) + "secends") print("done.") @@ -134,3 +137,8 @@ def localtest(category): # 本地加载的源码测试 time.sleep(10) print("page " + str(page) + " sleep over at " + time.ctime()) page += 1 + +def clearRedis(): # 用于清空Redis队列 + while not isNullRedis(): + redisconn.lpop(REDIS_LISTNAME) + print("Redis queue has cleared.") \ No newline at end of file