重写了connect通信程序与服务器通信系统,彻底重写了终端节点集群,对整个系统进行了较大幅度的优化,优化集群为多进程,增加轮询间隔,小优化

master
wufayuan 2 years ago
parent 58c2162918
commit 06e1f4c565

@@ -115,7 +115,16 @@ class Client(multiprocessing.Process):
if request_map['type'] == 'request':
logger.info("[REQUEST] receiving help request: " + json.dumps(request_map, ensure_ascii=False))
response_map = crawl(request_map)
try:
response_map = crawl(request_map)
except Exception as e:
logger.error(f'[Error] {e.__class__.__name__}: {str(e)}')
try:
response_map = crawl(request_map)
except Exception as e:
logger.error(f'[Error] {e.__class__.__name__}: {str(e)}')
# 爬取失败
response_map = {'0': {'name': None, 'college': None, 'major': None, 'title': None}, 'failed_task': {request_map}, 'success': False}
response_map.update({'cookie': request_map['cookie']})
client_socket.sendall(generate_response(response_map))
logger.info(f'[RESPONSE] sending client result {response_map}...')
@@ -132,7 +141,7 @@ class Client(multiprocessing.Process):
break
except Exception as e:
logger.error(str(e))
logger.error(f'[Error] {e.__class__.__name__}: {str(e)}')
if __name__ == '__main__':

File diff suppressed because one or more lines are too long

@@ -1,3 +1,5 @@
import csv
import random
import socket
import threading
from typing import Optional
@@ -189,6 +191,7 @@ class Spider_task(threading.Thread):
if task.is_partial_task_crawl_completely():
continue
else:
random.shuffle(free_remote_nodes)
current_task_thread = task.thread
if current_task_thread is None:
for f_node in free_remote_nodes:

@@ -1,14 +1,11 @@
# 知网论文数据爬取
import csv
from bs4 import BeautifulSoup
import time
import requests
import requests
from bs4 import BeautifulSoup
# 定义论文类
from loguru import logger
from msedge.selenium_tools import webdriver
from msedge.selenium_tools.service import Service
from selenium.webdriver.common.by import By
@@ -39,7 +36,7 @@ def driver_open(driver, key_word):
# 点击搜索按钮
driver.find_element(by=By.CSS_SELECTOR,
value='body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click()
time.sleep(1) # 必须要等待
time.sleep(1.5) # 必须要等待
content = driver.page_source.encode('utf-8')
# driver.close()
soup = BeautifulSoup(content, 'lxml')

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save