From 656ead319e3217f21d81daf90894b4c07e526bad Mon Sep 17 00:00:00 2001 From: wufayuan <2858767122@qq.com> Date: Thu, 21 Apr 2022 20:47:01 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 70 +++++++++++++++++++++++++++++++++++++++++++- dcs/dcs.log | 45 ++++++++++++++++++++++++++++ dcs/tests/zhiwang.py | 6 ++-- 3 files changed, 117 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1ecaef7..09abf4e 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,70 @@ -# dcs +# 分布式爬虫系统 +## 下载&安装 + +### 爬虫 + +#### 安装selenium + +```bash +pip3 install selenium +``` + +#### 下载 Edge 浏览器驱动(WebDriver) + +https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ + +![img](https://img-blog.csdnimg.cn/20201014171452760.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3RrMTAyMw==,size_16,color_FFFFFF,t_70) + +在浏览器中依次打开 设置 --> 关于 Microsoft Edge,查看版本信息;下载的驱动版本要与该版本号对应(浏览器图标也要对应上,是这个带绿色的) + +![img](https://img-blog.csdnimg.cn/20201014171642418.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3RrMTAyMw==,size_16,color_FFFFFF,t_70) + +把下载的浏览器驱动程序放在 dcs/bin 目录下 + +可以用下面的脚本测试 + +```python +from time import sleep +from selenium import webdriver + +driverfile_path = r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe' +driver = webdriver.Edge(executable_path=driverfile_path) + +driver.get(r'https://www.baidu.com/') + +sleep(5) +driver.close() +``` + +上面脚本中的驱动路径需要改成自己本机的实际路径 + +## 运行 + +python3 运行 main.py 文件,开启 server、spider、user_process、requester、communicate 五个服务线程,分布式爬虫系统服务端开始运行和监测。 + +node 运行 login.js,即可开启web服务器,可接收浏览器请求,之后与爬虫服务器通信,取得结果后返回浏览器。 + +再运行 client.py 文件,运行客户端,开始请求爬虫任务,服务端即可接收、分配并执行、组合,最终返回结果到客户端。 + +## 运行截图 + +![image-20220421204241089](X:\Users\god\AppData\Roaming\Typora\typora-user-images\image-20220421204241089.png) + 
+![image-20220421204341598](X:\Users\god\AppData\Roaming\Typora\typora-user-images\image-20220421204341598.png) + +![image-20220421204402347](X:\Users\god\AppData\Roaming\Typora\typora-user-images\image-20220421204402347.png) + +## 项目结构图 + +> [项目结构图 (educoder.net)](https://bdgit.educoder.net/api/v1/repos/p3t2ja9zs/dcs/raw/docs/pictures/项目结构图.pdf?ref=master&access_token=96cd287dc28356421b6d6033d26cdeb8648df583) + +## 服务器运行日志 + +> https://code.educoder.net/attachments/entries/get_file?download_url=https://code.educoder.net/api/p3t2ja9zs/dcs/raw?filepath=dcs/dcs.log&ref=master + +## 更新日志 + +### V1.0 + +基本框架搭建完毕,实现核心的类“P2P”机制 \ No newline at end of file diff --git a/dcs/dcs.log b/dcs/dcs.log index 7623ad8..2f127a0 100644 --- a/dcs/dcs.log +++ b/dcs/dcs.log @@ -6756,3 +6756,48 @@ 2022-04-21 17:27:52.938 | DEBUG | dcs.tests.spider_task:compose_result:192 - {1: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 2: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 3: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 4: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 5: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 6: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}} 2022-04-21 17:27:53.082 | DEBUG | dcs.tests.spider_task:compose_result:192 - {1: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 2: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 3: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 4: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 5: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 6: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}} 2022-04-21 17:27:53.088 | INFO | dcs.communicate:run:33 - sending response to ('127.0.0.1', 9000): {'0': {'name': 'remote', 'college': 'remote', 'major': 'remote', 
'paper': 'remote'}, 1: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 2: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 3: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 4: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 5: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 6: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 'type': 'response'} +2022-04-21 20:42:26.366 | DEBUG | __main__::14 - reading config args... +2022-04-21 20:42:26.367 | DEBUG | __main__::21 - starting the main server... +2022-04-21 20:42:26.367 | DEBUG | __main__::26 - starting the requester server... +2022-04-21 20:42:26.368 | DEBUG | __main__::31 - starting the spider server... +2022-04-21 20:42:26.374 | DEBUG | __main__::36 - starting the user server... +2022-04-21 20:42:26.398 | DEBUG | __main__::41 - starting the communicator server... +2022-04-21 20:42:56.466 | DEBUG | dcs.server:run:25 - connected to client ('127.0.0.1', 9000) +2022-04-21 20:42:56.496 | INFO | dcs.user_process:run:18 - processing user request... +2022-04-21 20:42:56.496 | INFO | dcs.tests.user_request_handler:register:37 - [REQUEST] register +2022-04-21 20:42:56.820 | INFO | dcs.tests.user_request_handler:register:45 - [RESPONSE] register: 用户名已存在,注册失败 +2022-04-21 20:42:56.832 | INFO | dcs.communicate:run:26 - sending response to ('127.0.0.1', 9000): {'register': '用户名已存在,注册失败'} +2022-04-21 20:42:56.856 | INFO | dcs.user_process:run:18 - processing user request... 
+2022-04-21 20:42:56.874 | INFO | dcs.tests.user_request_handler:login:26 - [REQUEST] login +2022-04-21 20:42:57.404 | INFO | dcs.tests.user_request_handler:login:34 - [RESPONSE] cookie: e1e6455f7b01ba14326145dd8f96e6b8c365ac1b +2022-04-21 20:42:57.428 | INFO | dcs.communicate:run:26 - sending response to ('127.0.0.1', 9000): {'cookie': 'e1e6455f7b01ba14326145dd8f96e6b8c365ac1b'} +2022-04-21 20:42:57.471 | INFO | dcs.user_process:run:18 - processing user request... +2022-04-21 20:42:57.483 | INFO | dcs.tests.user_request_handler:report_state:16 - [REQUEST] report free +2022-04-21 20:42:57.483 | INFO | dcs.tests.user_request_handler:report_state:23 - [RESPONSE] report free: success marked e1e6455f7b01ba14326145dd8f96e6b8c365ac1b +2022-04-21 20:42:57.506 | INFO | dcs.communicate:run:26 - sending response to ('127.0.0.1', 9000): {'report_free': 'success marked e1e6455f7b01ba14326145dd8f96e6b8c365ac1b'} +2022-04-21 20:42:57.530 | INFO | dcs.tests.spider_task:distribute_task:154 - distributing task: (('127.0.0.1', 9000), {'action': 'crawl zhiwang', 'word': 'computer', 'pages_start': 1, 'pages_end': 10, 'cookie': 'e1e6455f7b01ba14326145dd8f96e6b8c365ac1b'}) +2022-04-21 20:42:57.530 | DEBUG | dcs.tests.spider_task:distribute_task:167 - [, , ] +2022-04-21 20:42:57.530 | DEBUG | dcs.tests.spider_task:run:203 - [] +2022-04-21 20:42:57.531 | DEBUG | dcs.tests.spider_task:run:215 - generating remote task +2022-04-21 20:42:57.531 | INFO | dcs.requester:get:44 - sending crawl request to ('127.0.0.1', 9000) +2022-04-21 20:42:57.545 | DEBUG | dcs.tests.spider_task:run:223 - generating local task +2022-04-21 20:42:57.557 | INFO | dcs.communicate:run:26 - sending response to ('127.0.0.1', 9000): {'crawling state': 'starting, please wait...'} +2022-04-21 20:42:57.563 | INFO | dcs.tests.requestHandler:run:20 - [REQUEST] end +2022-04-21 20:42:57.569 | DEBUG | dcs.requester:run:70 - receiving remote task result, saving... 
+2022-04-21 20:42:57.569 | DEBUG | dcs.tests.spider_task:run:223 - generating local task +2022-04-21 20:42:57.599 | DEBUG | dcs.tests.requestHandler:run:21 - communication over from ('127.0.0.1', 9000)! +2022-04-21 20:42:57.641 | DEBUG | dcs.requester:run:74 - result: {'0': {'name': 'remote', 'college': 'remote', 'major': 'remote', 'paper': 'remote'}} +2022-04-21 20:42:58.163 | DEBUG | dcs.tests.spider_task:test_simulation:102 - simulation crawling... +2022-04-21 20:42:58.163 | INFO | dcs.tests.spider_task:write2database:35 - writing to database: test +2022-04-21 20:42:58.176 | DEBUG | dcs.tests.spider_task:test_simulation:102 - simulation crawling... +2022-04-21 20:42:58.176 | INFO | dcs.tests.spider_task:write2database:35 - writing to database: test +2022-04-21 20:42:58.353 | INFO | dcs.tests.spider_task:write2database:35 - writing to database: test +2022-04-21 20:42:58.394 | INFO | dcs.tests.spider_task:write2database:35 - writing to database: test +2022-04-21 20:42:58.690 | INFO | dcs.tests.spider_task:write2database:35 - writing to database: test +2022-04-21 20:42:58.726 | INFO | dcs.tests.spider_task:write2database:35 - writing to database: test +2022-04-21 20:42:58.978 | INFO | dcs.tests.spider_task:run:118 - partial crawl task finished: (('127.0.0.1', 9000), {'action': 'crawl zhiwang', 'word': 'computer', 'pages_start': 7, 'pages_end': 10, 'cookie': 'e1e6455f7b01ba14326145dd8f96e6b8c365ac1b'}) +2022-04-21 20:42:59.002 | INFO | dcs.tests.spider_task:run:118 - partial crawl task finished: (('127.0.0.1', 9000), {'action': 'crawl zhiwang', 'word': 'computer', 'pages_start': 4, 'pages_end': 7, 'cookie': 'e1e6455f7b01ba14326145dd8f96e6b8c365ac1b'}) +2022-04-21 20:42:59.038 | DEBUG | dcs.tests.spider_task:compose_result:177 - composing task... 
+2022-04-21 20:42:59.230 | DEBUG | dcs.tests.spider_task:compose_result:192 - {1: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 2: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 3: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 4: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 5: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 6: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}} +2022-04-21 20:42:59.476 | DEBUG | dcs.tests.spider_task:compose_result:192 - {1: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 2: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 3: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 4: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 5: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 6: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}} +2022-04-21 20:42:59.497 | INFO | dcs.communicate:run:33 - sending response to ('127.0.0.1', 9000): {'0': {'name': 'remote', 'college': 'remote', 'major': 'remote', 'paper': 'remote'}, 1: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 2: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 3: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 4: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 5: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 6: {'name': 'test', 'college': 'test', 'major': 'test', 'paper': 'test'}, 'type': 'response'} diff --git a/dcs/tests/zhiwang.py b/dcs/tests/zhiwang.py index 83a7226..b78117e 100644 --- a/dcs/tests/zhiwang.py +++ b/dcs/tests/zhiwang.py @@ -16,9 +16,8 @@ class Paper: def __str__(self): return f'{self.title}, authors' - # 定义作者类 - +# 定义作者类 class Author: def __init__(self, name, college, major): 
self.name = name @@ -34,7 +33,8 @@ def driver_open(driver, key_word): driver.find_element(by=By.CSS_SELECTOR, value='#txt_SearchText').send_keys(key_word) time.sleep(2) # 点击搜索按钮 - driver.find_element(by=By.CSS_SELECTOR, value='body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click() + driver.find_element(by=By.CSS_SELECTOR, + value='body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click() time.sleep(5) content = driver.page_source.encode('utf-8') # driver.close()