修改了线程逻辑 并将默认线程改为16

master
wkyuu 3 years ago
parent 47a7ec505c
commit b07a07ca69

@ -169,6 +169,10 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
在没有使用线程之前,完整跑完五个种类共(30 x 10 x 5 = 1500)条数据用时365s 在没有使用线程之前,完整跑完五个种类共(30 x 10 x 5 = 1500)条数据用时365s
使用线程数为5的情况下完整跑完五个种类共 1500条数据用时130s
使用线程数为16的情况下完整跑完五个种类共 1500条数据用时80s
## 参考链接 ## 参考链接
1[selenium+python自动化100-centos上搭建selenium启动chrome浏览器headless无界面模式](https://www.cnblogs.com/yoyoketang/p/11582012.html) 1[selenium+python自动化100-centos上搭建selenium启动chrome浏览器headless无界面模式](https://www.cnblogs.com/yoyoketang/p/11582012.html)
@ -179,7 +183,7 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
4[Python将list逐行读入到csv文件中](https://blog.csdn.net/weixin_41068770/article/details/103145660) 4[Python将list逐行读入到csv文件中](https://blog.csdn.net/weixin_41068770/article/details/103145660)
5[详解pandas的read_csv方法](https://www.cnblogs.com/traditional/p/12514914.html) 5
6[python 3 实现定义跨模块的全局变量和使用](https://codeantenna.com/a/9YbdOKrrSJ) 6[python 3 实现定义跨模块的全局变量和使用](https://codeantenna.com/a/9YbdOKrrSJ)
@ -189,4 +193,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
9[python + redis 实现 分布式队列任务](https://cloud.tencent.com/developer/article/1697383) 9[python + redis 实现 分布式队列任务](https://cloud.tencent.com/developer/article/1697383)
10 10[深入理解Python线程中join()函数](https://www.linuxidc.com/Linux/2019-03/157795.htm)
11

@ -65,20 +65,21 @@ class milkSpider(threading.Thread):
def __init__(self, name, url):
    """Create a spider thread bound to a single page URL.

    Args:
        name: Value stored as the thread's ``name``; printed by ``run()``
            when the thread finishes.
        url: Page URL handled by this thread. It is also passed to
            ``getCategory`` to derive ``self.category``.
    """
    super().__init__()
    self.name = name
    # Store the ``url`` parameter itself; ``url1`` is not defined in this scope.
    self.url = url
    self.category = getCategory(url)
    # The page source is intentionally not fetched here: the constructor
    # executes in the thread that creates the spider, so downloading at this
    # point would block that thread for every spider it builds.
def run(self):
    """Download this thread's page, then write it to CSV under ``threadLock``.

    The download happens before the lock is taken, so only the CSV write is
    serialized between threads.
    """
    self.response = downloader.getsource(self.url)
    threadLock.acquire()
    try:
        print("write2csv for '{}' was started.".format(self.url))
        write2csv(self.category, self.response)
        print("{} is done.".format(self.name))
    finally:
        # Release even when write2csv raises; otherwise every other thread
        # waiting on acquire() would block forever.
        threadLock.release()
def mainThread(threadlines = 5, flag = flag): # 线程数默认为3 def mainThread(threadlines = 16, flag = flag): # 线程数默认为16
try: try:
threads = [] threads = []
for index in range(1, threadlines + 1): for index in range(1, threadlines + 1):
@ -106,7 +107,9 @@ def mainThread(threadlines = 5, flag = flag): # 线程数默认为3
if __name__ == '__main__': if __name__ == '__main__':
if precheck(): if precheck():
start_time = time.time()
mainThread() mainThread()
print("Totally spend " + str(time.time() - start_time) + "secends")
print("done.") print("done.")
@ -134,3 +137,8 @@ def localtest(category): # 本地加载的源码测试
time.sleep(10) time.sleep(10)
print("page " + str(page) + " sleep over at " + time.ctime()) print("page " + str(page) + " sleep over at " + time.ctime())
page += 1 page += 1
def clearRedis():  # Used to empty the Redis queue
    """Pop items off the ``REDIS_LISTNAME`` list until ``isNullRedis()`` is truthy.

    ``isNullRedis`` is defined elsewhere; presumably it reports that the
    queue is empty (confirm against its definition). A confirmation message
    is printed once the loop ends.
    """
    while True:
        if isNullRedis():
            break
        redisconn.lpop(REDIS_LISTNAME)
    print("Redis queue has cleared.")
Loading…
Cancel
Save