diff --git a/dcs/dcs.log b/dcs/dcs.log index fa726f1..bb98089 100644 --- a/dcs/dcs.log +++ b/dcs/dcs.log @@ -108,3 +108,31 @@ 2022-03-17 17:09:25.025 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 2022-03-17 17:09:28.188 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang 2022-03-17 17:09:28.189 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-22 15:07:24.669 | INFO | __main__::8 - reading config args... +2022-03-22 15:07:24.669 | INFO | __main__::15 - starting the server... +2022-03-22 15:07:32.481 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-22 15:07:32.482 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-22 15:07:42.808 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-22 15:07:42.808 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-22 15:10:32.737 | INFO | __main__::8 - reading config args... +2022-03-22 15:10:32.738 | INFO | __main__::15 - starting the server... +2022-03-22 15:10:35.292 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-22 15:10:35.292 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-22 15:10:39.090 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-22 15:10:39.090 | INFO | dcs.tests.spider:run:53 - crawling... +2022-03-22 16:20:35.470 | INFO | __main__::8 - reading config args... +2022-03-22 16:20:35.471 | INFO | __main__::15 - starting the server... +2022-03-22 16:20:53.358 | INFO | dcs.tests.requestHandler:run:47 - [REQUEST] crawl zhiwang +2022-03-22 16:20:53.359 | INFO | dcs.tests.spider:run:63 - crawling... +2022-03-22 16:22:01.164 | INFO | __main__::8 - reading config args... +2022-03-22 16:22:01.165 | INFO | __main__::15 - starting the server... +2022-03-22 16:22:04.720 | INFO | dcs.tests.requestHandler:run:62 - [REQUEST] report free +2022-03-22 16:22:04.721 | INFO | dcs.tests.requestHandler:run:74 - [RESPONSE] report free: success marked ['127.0.0.1', 7777], header size: 53 +2022-03-22 16:22:09.515 | INFO | dcs.tests.requestHandler:run:47 - [REQUEST] crawl zhiwang +2022-03-22 16:22:09.516 | INFO | dcs.tests.spider:run:63 - crawling... +2022-03-22 16:26:55.046 | INFO | __main__::8 - reading config args... +2022-03-22 16:26:55.047 | INFO | __main__::15 - starting the server... +2022-03-22 16:26:57.162 | INFO | dcs.tests.requestHandler:run:62 - [REQUEST] report free +2022-03-22 16:26:57.162 | INFO | dcs.tests.requestHandler:run:74 - [RESPONSE] report free: success marked ['127.0.0.1', 7777], header size: 53 +2022-03-22 16:27:01.858 | INFO | dcs.tests.requestHandler:run:47 - [REQUEST] crawl zhiwang +2022-03-22 16:27:01.859 | INFO | dcs.tests.spider:run:63 - crawling... diff --git a/dcs/tests/client.py b/dcs/tests/client.py index 153f19c..033c0a9 100644 --- a/dcs/tests/client.py +++ b/dcs/tests/client.py @@ -52,7 +52,7 @@ class Client(Thread): self.ip = ip self.port = port - def test(self) -> 'int': + def test(self): with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: socket_to_server.connect((self.ip, self.port)) request = dict() @@ -68,7 +68,7 @@ class Client(Thread): return responseJson['test'] - def translate(self, word: str) -> 'int': + def translate(self, word: str): with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: socket_to_server.connect((self.ip, self.port)) request = dict() @@ -85,7 +85,7 @@ class Client(Thread): return responseJson['translate'] - def crawling(self, word: str) -> 'int': + def crawling(self, word: str): with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: socket_to_server.connect((self.ip, self.port)) request = dict() @@ -102,6 +102,24 @@ class Client(Thread): return responseJson['crawl zhiwang'] + def report_status(self, status: str): + # status: free or busy + with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: + socket_to_server.connect((self.ip, self.port)) + request = dict() + request['action'] = 'report_' + status + request['spider_info'] = (ip, port) + + full_request = generate_request(request) + + socket_to_server.sendall(full_request) + + responseJson = JSONDecoder().decode( + read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode( + "utf-8")) + + return responseJson['report_'+status] + def end(self): """ 结束通信 @@ -119,8 +137,9 @@ class Client(Thread): print("end communication!") def run(self) -> None: - print(self.test()) + print(self.report_status('free')) print(self.crawling(input("word:"))) + self.report_status('free') self.end() diff --git a/dcs/tests/config.py b/dcs/tests/config.py new file mode 100644 index 0000000..184da2a --- /dev/null +++ b/dcs/tests/config.py @@ -0,0 +1,16 @@ +class global_var: + """需要定义全局变量的放在这里,最好定义一个初始值""" + free_spiders = [] + + +# 对于每个全局变量,都需要定义get_value和set_value接口 +def add_free_spider(spider_info): + global_var.free_spiders.append(spider_info) + + +def get_free_spiders(): + return global_var.free_spiders + + +def delete_spider_by_id(spider_info): + global_var.free_spiders.remove(spider_info) diff --git a/dcs/tests/requestHandler.py b/dcs/tests/requestHandler.py index 4440c44..2ce6073 100644 --- a/dcs/tests/requestHandler.py +++ b/dcs/tests/requestHandler.py @@ -2,6 +2,7 @@ import socket import threading import json import struct +import dcs.tests.config from loguru import logger from dcs.tests.spider import Spider @@ -45,8 +46,9 @@ class RequestHandler(threading.Thread): elif self.request_map['action'] == 'crawl zhiwang': logger.info(f"[REQUEST] crawl zhiwang") spider = Spider(self.request_map['word']) + spider.run() response = { - 'crawl zhiwang': spider.run() + 'crawl zhiwang': 'success' # TODO } response_binary = json.JSONEncoder().encode(response).encode("utf-8") response_binary_len = len(response_binary) @@ -54,6 +56,22 @@ class RequestHandler(threading.Thread): response_binary = response_binary_len_binary + response_binary self.client_socket.sendall(response_binary) - logger.info(f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}, header size: {response_binary_len}") + logger.info( + f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}, header size: {response_binary_len}") + elif self.request_map['action'] == 'report_free': + logger.info(f"[REQUEST] report free") + if self.request_map['spider_info'] not in dcs.tests.config.get_free_spiders(): + dcs.tests.config.add_free_spider(self.request_map['spider_info']) + response = { + 'report_free': 'success marked ' + str(self.request_map['spider_info']) + } + response_binary = json.JSONEncoder().encode(response).encode("utf-8") + response_binary_len = len(response_binary) + response_binary_len_binary = struct.pack("!Q", response_binary_len) + + response_binary = response_binary_len_binary + response_binary + self.client_socket.sendall(response_binary) + logger.info( + f"[RESPONSE] report free: {response['report_free']}, header size: {response_binary_len}") finally: self.client_socket.close() diff --git a/dcs/tests/spider.py b/dcs/tests/spider.py index ef2b72e..dc89b73 100644 --- a/dcs/tests/spider.py +++ b/dcs/tests/spider.py @@ -1,4 +1,5 @@ import threading +import dcs.tests.config from msedge.selenium_tools import Edge from msedge.selenium_tools import EdgeOptions @@ -24,27 +25,41 @@ def translate(word): return result -def crawl_zhiwang(word, pages_start=2, pages_end=3): +def crawl_zhiwang(word, pages_start=1, pages_end=2): edge_options = EdgeOptions() edge_options.use_chromium = True edge_options.add_argument('headless') driver = Edge(options=edge_options, executable_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe') soup = driver_open(driver, word) # 搜索word papers = [] # 用于保存爬取到的论文 - spider(driver, soup, papers) + # 爬取第一篇 + if pages_start == 1: + spider(driver, soup, papers) + pages_start += 1 for pn in range(pages_start, pages_end): content = change_page(driver, pn) spider(driver, content, papers) driver.close() + # TODO 写入数据库 class Spider(threading.Thread): - def __init__(self, word: str): + def __init__(self, word: str, pages_start=1, pages_end=1): super().__init__() self.word = word self.daemon = True + self.pages_start = pages_start + self.pages_end = pages_end pass + def distribute_spiders(self): + free_spiders = dcs.tests.config.get_free_spiders() + for sp in free_spiders: + pass + print(self.pages_start, sp) + # TODO 发布任务 + def run(self) -> None: logger.info('crawling...') - crawl_zhiwang(word=self.word) + self.distribute_spiders() + crawl_zhiwang(word=self.word, pages_start=self.pages_start, pages_end=self.pages_end)