Merge pull request '爬虫' (#3) from developer_wufayuan into master

developer_wufayuan
p3t2ja9zs 3 years ago
commit b51f5cbf93

@ -108,3 +108,31 @@
2022-03-17 17:09:25.025 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 17:09:28.188 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 17:09:28.189 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-22 15:07:24.669 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 15:07:24.669 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 15:07:32.481 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-22 15:07:32.482 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-22 15:07:42.808 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-22 15:07:42.808 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-22 15:10:32.737 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 15:10:32.738 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 15:10:35.292 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-22 15:10:35.292 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-22 15:10:39.090 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-22 15:10:39.090 | INFO | dcs.tests.spider:run:53 - crawling...
2022-03-22 16:20:35.470 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 16:20:35.471 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 16:20:53.358 | INFO | dcs.tests.requestHandler:run:47 - [REQUEST] crawl zhiwang
2022-03-22 16:20:53.359 | INFO | dcs.tests.spider:run:63 - crawling...
2022-03-22 16:22:01.164 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 16:22:01.165 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 16:22:04.720 | INFO | dcs.tests.requestHandler:run:62 - [REQUEST] report free
2022-03-22 16:22:04.721 | INFO | dcs.tests.requestHandler:run:74 - [RESPONSE] report free: success marked ['127.0.0.1', 7777], header size: 53
2022-03-22 16:22:09.515 | INFO | dcs.tests.requestHandler:run:47 - [REQUEST] crawl zhiwang
2022-03-22 16:22:09.516 | INFO | dcs.tests.spider:run:63 - crawling...
2022-03-22 16:26:55.046 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 16:26:55.047 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 16:26:57.162 | INFO | dcs.tests.requestHandler:run:62 - [REQUEST] report free
2022-03-22 16:26:57.162 | INFO | dcs.tests.requestHandler:run:74 - [RESPONSE] report free: success marked ['127.0.0.1', 7777], header size: 53
2022-03-22 16:27:01.858 | INFO | dcs.tests.requestHandler:run:47 - [REQUEST] crawl zhiwang
2022-03-22 16:27:01.859 | INFO | dcs.tests.spider:run:63 - crawling...

@ -52,7 +52,7 @@ class Client(Thread):
self.ip = ip
self.port = port
def test(self) -> 'int': def test(self):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.connect((self.ip, self.port)) socket_to_server.connect((self.ip, self.port))
request = dict() request = dict()
@ -68,7 +68,7 @@ class Client(Thread):
return responseJson['test'] return responseJson['test']
def translate(self, word: str) -> 'int': def translate(self, word: str):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.connect((self.ip, self.port)) socket_to_server.connect((self.ip, self.port))
request = dict() request = dict()
@ -85,7 +85,7 @@ class Client(Thread):
return responseJson['translate'] return responseJson['translate']
def crawling(self, word: str) -> 'int': def crawling(self, word: str):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.connect((self.ip, self.port)) socket_to_server.connect((self.ip, self.port))
request = dict() request = dict()
@ -102,6 +102,24 @@ class Client(Thread):
return responseJson['crawl zhiwang'] return responseJson['crawl zhiwang']
def report_status(self, status: str):
    """Report this client's spider availability to the server.

    Opens a fresh TCP connection, sends a ``report_<status>`` action
    carrying this client's own address so the server can register it in
    its free-spider list, and returns the server's acknowledgement.

    :param status: 'free' or 'busy'
    :return: the server's acknowledgement value for ``report_<status>``
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
        socket_to_server.connect((self.ip, self.port))
        request = dict()
        request['action'] = 'report_' + status
        # BUG FIX: original read the undefined local names (ip, port);
        # the client's own endpoint lives on self.
        request['spider_info'] = (self.ip, self.port)
        full_request = generate_request(request)
        socket_to_server.sendall(full_request)
        # Protocol: an 8-byte big-endian length header, then a UTF-8 JSON body.
        responseJson = JSONDecoder().decode(
            read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
                "utf-8"))
        return responseJson['report_' + status]
def end(self): def end(self):
""" """
结束通信 结束通信
@ -119,8 +137,9 @@ class Client(Thread):
print("end communication!") print("end communication!")
def run(self) -> None: def run(self) -> None:
print(self.test()) print(self.report_status('free'))
print(self.crawling(input("word:"))) print(self.crawling(input("word:")))
self.report_status('free')
self.end() self.end()

@ -0,0 +1,16 @@
class global_var:
    """Container for process-wide shared state.

    Every global lives here with an initial value; access goes through
    the module-level helper functions below.
    """

    # Endpoints of spider clients currently marked idle,
    # e.g. ('127.0.0.1', 7777).
    free_spiders = []


def add_free_spider(spider_info):
    """Register *spider_info* as an idle spider."""
    global_var.free_spiders.append(spider_info)


def get_free_spiders():
    """Return the idle-spider list (a live reference, not a copy)."""
    return global_var.free_spiders


def delete_spider_by_id(spider_info):
    """Unregister *spider_info*; raises ValueError if it is not listed."""
    global_var.free_spiders.remove(spider_info)

@ -2,6 +2,7 @@ import socket
import threading import threading
import json import json
import struct import struct
import dcs.tests.config
from loguru import logger from loguru import logger
from dcs.tests.spider import Spider from dcs.tests.spider import Spider
@ -45,8 +46,9 @@ class RequestHandler(threading.Thread):
elif self.request_map['action'] == 'crawl zhiwang': elif self.request_map['action'] == 'crawl zhiwang':
logger.info(f"[REQUEST] crawl zhiwang") logger.info(f"[REQUEST] crawl zhiwang")
spider = Spider(self.request_map['word']) spider = Spider(self.request_map['word'])
spider.run()
response = { response = {
'crawl zhiwang': spider.run() 'crawl zhiwang': 'success' # TODO
} }
response_binary = json.JSONEncoder().encode(response).encode("utf-8") response_binary = json.JSONEncoder().encode(response).encode("utf-8")
response_binary_len = len(response_binary) response_binary_len = len(response_binary)
@ -54,6 +56,22 @@ class RequestHandler(threading.Thread):
response_binary = response_binary_len_binary + response_binary response_binary = response_binary_len_binary + response_binary
self.client_socket.sendall(response_binary) self.client_socket.sendall(response_binary)
logger.info(f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}, header size: {response_binary_len}") logger.info(
f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}, header size: {response_binary_len}")
elif self.request_map['action'] == 'report_free':
logger.info(f"[REQUEST] report free")
if self.request_map['spider_info'] not in dcs.tests.config.get_free_spiders():
dcs.tests.config.add_free_spider(self.request_map['spider_info'])
response = {
'report_free': 'success marked ' + str(self.request_map['spider_info'])
}
response_binary = json.JSONEncoder().encode(response).encode("utf-8")
response_binary_len = len(response_binary)
response_binary_len_binary = struct.pack("!Q", response_binary_len)
response_binary = response_binary_len_binary + response_binary
self.client_socket.sendall(response_binary)
logger.info(
f"[RESPONSE] report free: {response['report_free']}, header size: {response_binary_len}")
finally: finally:
self.client_socket.close() self.client_socket.close()

@ -1,4 +1,5 @@
import threading import threading
import dcs.tests.config
from msedge.selenium_tools import Edge from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions from msedge.selenium_tools import EdgeOptions
@ -24,27 +25,41 @@ def translate(word):
return result return result
def crawl_zhiwang(word, pages_start=2, pages_end=3): def crawl_zhiwang(word, pages_start=1, pages_end=2):
edge_options = EdgeOptions() edge_options = EdgeOptions()
edge_options.use_chromium = True edge_options.use_chromium = True
edge_options.add_argument('headless') edge_options.add_argument('headless')
driver = Edge(options=edge_options, executable_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe') driver = Edge(options=edge_options, executable_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe')
soup = driver_open(driver, word) # 搜索word soup = driver_open(driver, word) # 搜索word
papers = [] # 用于保存爬取到的论文 papers = [] # 用于保存爬取到的论文
spider(driver, soup, papers) # 爬取第一篇
if pages_start == 1:
spider(driver, soup, papers)
pages_start += 1
for pn in range(pages_start, pages_end): for pn in range(pages_start, pages_end):
content = change_page(driver, pn) content = change_page(driver, pn)
spider(driver, content, papers) spider(driver, content, papers)
driver.close() driver.close()
# TODO 写入数据库
class Spider(threading.Thread): class Spider(threading.Thread):
def __init__(self, word: str): def __init__(self, word: str, pages_start=1, pages_end=1):
super().__init__() super().__init__()
self.word = word self.word = word
self.daemon = True self.daemon = True
self.pages_start = pages_start
self.pages_end = pages_end
pass pass
def distribute_spiders(self):
    """Plan the hand-out of crawl page ranges to idle spider nodes.

    Reads the shared free-spider registry and, for now, only prints the
    pairing of the starting page with each idle spider; the actual task
    publishing is not implemented yet.
    """
    free_spiders = dcs.tests.config.get_free_spiders()
    for sp in free_spiders:
        # Dead `pass` removed from the original loop body.
        # Placeholder output: which start page would go to which spider.
        print(self.pages_start, sp)
        # TODO: actually publish the crawl task to spider `sp`
def run(self) -> None: def run(self) -> None:
logger.info('crawling...') logger.info('crawling...')
crawl_zhiwang(word=self.word) self.distribute_spiders()
crawl_zhiwang(word=self.word, pages_start=self.pages_start, pages_end=self.pages_end)

Loading…
Cancel
Save