diff --git a/bin/msedgedriver.exe b/bin/msedgedriver.exe new file mode 100644 index 0000000..0ed1012 Binary files /dev/null and b/bin/msedgedriver.exe differ diff --git a/dcs/dcs.log b/dcs/dcs.log index 35fee5b..fa726f1 100644 --- a/dcs/dcs.log +++ b/dcs/dcs.log @@ -22,3 +22,89 @@ 2022-03-16 15:56:17.146 | INFO | dcs.tests.server:run:36 - [REQUEST] end 2022-03-16 15:56:17.147 | WARNING | dcs.tests.server:run:37 - communication over! 2022-03-16 15:56:17.149 | WARNING | __main__::21 - Overing... +2022-03-17 16:33:41.255 | INFO | __main__::8 - reading config args... +2022-03-17 16:33:41.256 | INFO | __main__::15 - starting the server... +2022-03-17 16:33:49.912 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:33:49.913 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:33:57.605 | INFO | dcs.tests.requestHandler:run:33 - [REQUEST] translate +2022-03-17 16:33:57.606 | INFO | dcs.tests.spider:run:45 - crawling... +2022-03-17 16:38:20.919 | INFO | __main__::8 - reading config args... +2022-03-17 16:38:20.920 | INFO | __main__::15 - starting the server... +2022-03-17 16:38:26.531 | INFO | __main__::8 - reading config args... +2022-03-17 16:38:26.531 | INFO | __main__::15 - starting the server... +2022-03-17 16:38:29.286 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:38:29.287 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:38:33.326 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:38:33.327 | INFO | dcs.tests.spider:run:45 - crawling... +2022-03-17 16:41:03.903 | INFO | __main__::8 - reading config args... +2022-03-17 16:41:03.903 | INFO | __main__::15 - starting the server... +2022-03-17 16:41:06.026 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:41:06.027 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:41:09.812 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:41:09.812 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-17 16:43:18.535 | INFO | __main__::8 - reading config args... +2022-03-17 16:43:18.535 | INFO | __main__::15 - starting the server... +2022-03-17 16:43:22.518 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:43:22.518 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:43:26.977 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:43:26.977 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-17 16:44:01.823 | INFO | __main__::8 - reading config args... +2022-03-17 16:44:01.824 | INFO | __main__::15 - starting the server... +2022-03-17 16:44:04.256 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:44:04.257 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:44:08.634 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:44:08.634 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-17 16:44:55.914 | INFO | __main__::8 - reading config args... +2022-03-17 16:44:55.914 | INFO | __main__::15 - starting the server... +2022-03-17 16:44:57.966 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:44:57.967 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:45:01.569 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:45:01.570 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-17 16:45:59.893 | INFO | __main__::8 - reading config args... +2022-03-17 16:45:59.894 | INFO | __main__::15 - starting the server... +2022-03-17 16:46:03.158 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:46:03.158 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:46:06.376 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:46:06.377 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-17 16:48:25.032 | INFO | __main__::8 - reading config args... +2022-03-17 16:48:25.033 | INFO | __main__::15 - starting the server... +2022-03-17 16:48:26.903 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:48:26.904 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:48:29.504 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:48:29.505 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-17 16:50:28.903 | INFO | __main__::8 - reading config args... +2022-03-17 16:50:28.905 | INFO | __main__::15 - starting the server... +2022-03-17 16:50:30.975 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:50:30.975 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:50:33.959 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:50:33.960 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-17 16:56:08.569 | INFO | __main__::8 - reading config args... +2022-03-17 16:56:08.569 | INFO | __main__::15 - starting the server... +2022-03-17 16:56:19.722 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:56:19.722 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:56:21.588 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:56:21.589 | INFO | dcs.tests.spider:run:48 - crawling... +2022-03-17 16:58:01.057 | INFO | __main__::8 - reading config args... +2022-03-17 16:58:01.058 | INFO | __main__::15 - starting the server... +2022-03-17 16:58:04.142 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 16:58:04.143 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 16:58:09.346 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 16:58:09.347 | INFO | dcs.tests.spider:run:48 - crawling... +2022-03-17 17:06:58.479 | INFO | __main__::8 - reading config args... +2022-03-17 17:06:58.480 | INFO | __main__::15 - starting the server... +2022-03-17 17:07:02.340 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 17:07:02.341 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 17:07:06.076 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 17:07:06.076 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-17 17:08:39.065 | INFO | __main__::8 - reading config args... +2022-03-17 17:08:39.066 | INFO | __main__::15 - starting the server... +2022-03-17 17:08:42.427 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 17:08:42.428 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 17:08:45.521 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 17:08:45.522 | INFO | dcs.tests.spider:run:49 - crawling... +2022-03-17 17:09:23.242 | INFO | __main__::8 - reading config args... +2022-03-17 17:09:23.243 | INFO | __main__::15 - starting the server... +2022-03-17 17:09:25.024 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test +2022-03-17 17:09:25.025 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22 +2022-03-17 17:09:28.188 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang +2022-03-17 17:09:28.189 | INFO | dcs.tests.spider:run:49 - crawling... diff --git a/dcs/tests/client.py b/dcs/tests/client.py index 81b66a9..153f19c 100644 --- a/dcs/tests/client.py +++ b/dcs/tests/client.py @@ -85,6 +85,23 @@ class Client(Thread): return responseJson['translate'] + def crawling(self, word: str) -> 'int': + with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: + socket_to_server.connect((self.ip, self.port)) + request = dict() + request['action'] = 'crawl zhiwang' + request['word'] = word + + full_request = generate_request(request) + + socket_to_server.sendall(full_request) + + responseJson = JSONDecoder().decode( + read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode( + "utf-8")) + + return responseJson['crawl zhiwang'] + def end(self): """ 结束通信 @@ -103,7 +120,7 @@ class Client(Thread): def run(self) -> None: print(self.test()) - print(self.translate(input("word:"))) + print(self.crawling(input("word:"))) self.end() diff --git a/dcs/tests/requestHandler.py b/dcs/tests/requestHandler.py index 4b85de9..4440c44 100644 --- a/dcs/tests/requestHandler.py +++ b/dcs/tests/requestHandler.py @@ -42,5 +42,18 @@ class RequestHandler(threading.Thread): response_binary = response_binary_len_binary + response_binary self.client_socket.sendall(response_binary) logger.info(f"[RESPONSE] translate: {response['translate']}, header size: {response_binary_len}") + elif self.request_map['action'] == 'crawl zhiwang': + logger.info(f"[REQUEST] crawl zhiwang") + spider = Spider(self.request_map['word']) + response = { + 'crawl zhiwang': spider.run() + } + response_binary = json.JSONEncoder().encode(response).encode("utf-8") + response_binary_len = len(response_binary) + response_binary_len_binary = struct.pack("!Q", response_binary_len) + + response_binary = response_binary_len_binary + response_binary + self.client_socket.sendall(response_binary) + logger.info(f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}, header size: {response_binary_len}") finally: self.client_socket.close() diff --git a/dcs/tests/spider.py b/dcs/tests/spider.py index 7a41aa8..ef2b72e 100644 --- a/dcs/tests/spider.py +++ b/dcs/tests/spider.py @@ -1,9 +1,12 @@ import threading -import requests +from msedge.selenium_tools import Edge +from msedge.selenium_tools import EdgeOptions + +from dcs.tests.zhiwang import * from loguru import logger -def crawl(word): +def translate(word): url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule' data = {'i': word, 'from': 'AUTO', @@ -21,8 +24,22 @@ def crawl(word): return result +def crawl_zhiwang(word, pages_start=2, pages_end=3): + edge_options = EdgeOptions() + edge_options.use_chromium = True + edge_options.add_argument('headless') + driver = Edge(options=edge_options, executable_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe') + soup = driver_open(driver, word) # 搜索word + papers = [] # 用于保存爬取到的论文 + spider(driver, soup, papers) + for pn in range(pages_start, pages_end): + content = change_page(driver, pn) + spider(driver, content, papers) + driver.close() + + class Spider(threading.Thread): - def __init__(self, word: 'string'): + def __init__(self, word: str): super().__init__() self.word = word self.daemon = True @@ -30,5 +47,4 @@ class Spider(threading.Thread): def run(self) -> None: logger.info('crawling...') - return crawl(self.word) - + crawl_zhiwang(word=self.word) diff --git a/dcs/tests/zhiwang.py b/dcs/tests/zhiwang.py index f3d6034..f833d43 100644 --- a/dcs/tests/zhiwang.py +++ b/dcs/tests/zhiwang.py @@ -3,11 +3,8 @@ ''' from bs4 import BeautifulSoup -from selenium import webdriver -from selenium.webdriver.chrome.service import Service import time import requests -import csv # 定义论文类 @@ -47,7 +44,9 @@ def driver_open(driver, key_word): def spider(driver, soup, papers): tbody = soup.find_all('tbody') - tbody = BeautifulSoup(str(tbody[0]), 'lxml') + try: + tbody = BeautifulSoup(str(tbody[0]), 'lxml') + except:return tr = tbody.find_all('tr') for item in tr: tr_bf = BeautifulSoup(str(item), 'lxml') @@ -121,6 +120,7 @@ def get_author_info(skey, code): if __name__ == '__main__': + ''' browser = Service('../../bin/msedgedriver.exe') driver = webdriver.Edge(service=browser) soup = driver_open(driver, '知识图谱') # 搜索知识图谱 @@ -147,3 +147,4 @@ if __name__ == '__main__': # 关闭文件 f_papers_authors.close() + ''' diff --git a/requirements.txt b/requirements.txt index 0295efd..738122f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,6 @@ loguru~=0.6.0 -requests~=2.27.1 \ No newline at end of file +requests~=2.27.1 +pandas~=1.3.4 +bs4~=0.0.1 +beautifulsoup4~=4.10.0 +selenium~=4.1.3 \ No newline at end of file