Merge pull request '爬虫' (#3) from developer_wufayuan into master

developer_wufayuan
p3t2ja9zs 3 years ago
commit b51f5cbf93

@ -108,3 +108,31 @@
2022-03-17 17:09:25.025 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 17:09:28.188 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 17:09:28.189 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-22 15:07:24.669 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 15:07:24.669 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 15:07:32.481 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-22 15:07:32.482 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-22 15:07:42.808 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-22 15:07:42.808 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-22 15:10:32.737 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 15:10:32.738 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 15:10:35.292 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-22 15:10:35.292 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-22 15:10:39.090 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-22 15:10:39.090 | INFO | dcs.tests.spider:run:53 - crawling...
2022-03-22 16:20:35.470 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 16:20:35.471 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 16:20:53.358 | INFO | dcs.tests.requestHandler:run:47 - [REQUEST] crawl zhiwang
2022-03-22 16:20:53.359 | INFO | dcs.tests.spider:run:63 - crawling...
2022-03-22 16:22:01.164 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 16:22:01.165 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 16:22:04.720 | INFO | dcs.tests.requestHandler:run:62 - [REQUEST] report free
2022-03-22 16:22:04.721 | INFO | dcs.tests.requestHandler:run:74 - [RESPONSE] report free: success marked ['127.0.0.1', 7777], header size: 53
2022-03-22 16:22:09.515 | INFO | dcs.tests.requestHandler:run:47 - [REQUEST] crawl zhiwang
2022-03-22 16:22:09.516 | INFO | dcs.tests.spider:run:63 - crawling...
2022-03-22 16:26:55.046 | INFO | __main__:<module>:8 - reading config args...
2022-03-22 16:26:55.047 | INFO | __main__:<module>:15 - starting the server...
2022-03-22 16:26:57.162 | INFO | dcs.tests.requestHandler:run:62 - [REQUEST] report free
2022-03-22 16:26:57.162 | INFO | dcs.tests.requestHandler:run:74 - [RESPONSE] report free: success marked ['127.0.0.1', 7777], header size: 53
2022-03-22 16:27:01.858 | INFO | dcs.tests.requestHandler:run:47 - [REQUEST] crawl zhiwang
2022-03-22 16:27:01.859 | INFO | dcs.tests.spider:run:63 - crawling...

@ -52,7 +52,7 @@ class Client(Thread):
self.ip = ip
self.port = port
def test(self) -> 'int':
def test(self):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.connect((self.ip, self.port))
request = dict()
@ -68,7 +68,7 @@ class Client(Thread):
return responseJson['test']
def translate(self, word: str) -> 'int':
def translate(self, word: str):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.connect((self.ip, self.port))
request = dict()
@ -85,7 +85,7 @@ class Client(Thread):
return responseJson['translate']
def crawling(self, word: str) -> 'int':
def crawling(self, word: str):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.connect((self.ip, self.port))
request = dict()
@ -102,6 +102,24 @@ class Client(Thread):
return responseJson['crawl zhiwang']
def report_status(self, status: str):
    """Report this client's spider availability to the server.

    :param status: either ``'free'`` or ``'busy'``; becomes part of the
        request action name (``'report_free'`` / ``'report_busy'``).
    :return: the server's acknowledgement string stored under the
        ``'report_<status>'`` key of the JSON response.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
        socket_to_server.connect((self.ip, self.port))
        request = dict()
        request['action'] = 'report_' + status
        # BUG FIX: the original sent bare (ip, port), which are undefined
        # names here — the client's own address lives on self (set in
        # __init__, see the ip/port assignments above).
        request['spider_info'] = (self.ip, self.port)
        full_request = generate_request(request)
        socket_to_server.sendall(full_request)
        # The response is length-prefixed: an 8-byte big-endian size
        # (struct '!Q'), followed by that many bytes of UTF-8 JSON.
        responseJson = JSONDecoder().decode(
            read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
                "utf-8"))
        return responseJson['report_' + status]
def end(self):
"""
结束通信
@ -119,8 +137,9 @@ class Client(Thread):
print("end communication!")
def run(self) -> None:
print(self.test())
print(self.report_status('free'))
print(self.crawling(input("word:")))
self.report_status('free')
self.end()

@ -0,0 +1,16 @@
class global_var:
    """Holder for module-wide shared state.

    Every global lives here as a class attribute with an initial value;
    each one is read and written only through the module-level accessor
    functions defined below.
    """

    # Spiders that have reported themselves idle, stored as the
    # spider_info payload sent by the client (e.g. an (ip, port) pair
    # — assumed from the caller; confirm against requestHandler).
    free_spiders = []


def add_free_spider(spider_info):
    """Record *spider_info* as an available spider."""
    spiders = global_var.free_spiders
    spiders.append(spider_info)


def get_free_spiders():
    """Return the live (mutable) list of currently idle spiders."""
    return global_var.free_spiders


def delete_spider_by_id(spider_info):
    """Drop *spider_info* from the idle list.

    Raises ValueError when *spider_info* is not currently registered,
    matching list removal semantics.
    """
    spiders = global_var.free_spiders
    del spiders[spiders.index(spider_info)]

@ -2,6 +2,7 @@ import socket
import threading
import json
import struct
import dcs.tests.config
from loguru import logger
from dcs.tests.spider import Spider
@ -45,8 +46,9 @@ class RequestHandler(threading.Thread):
elif self.request_map['action'] == 'crawl zhiwang':
logger.info(f"[REQUEST] crawl zhiwang")
spider = Spider(self.request_map['word'])
spider.run()
response = {
'crawl zhiwang': spider.run()
'crawl zhiwang': 'success' # TODO
}
response_binary = json.JSONEncoder().encode(response).encode("utf-8")
response_binary_len = len(response_binary)
@ -54,6 +56,22 @@ class RequestHandler(threading.Thread):
response_binary = response_binary_len_binary + response_binary
self.client_socket.sendall(response_binary)
logger.info(f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}, header size: {response_binary_len}")
logger.info(
f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}, header size: {response_binary_len}")
elif self.request_map['action'] == 'report_free':
logger.info(f"[REQUEST] report free")
if self.request_map['spider_info'] not in dcs.tests.config.get_free_spiders():
dcs.tests.config.add_free_spider(self.request_map['spider_info'])
response = {
'report_free': 'success marked ' + str(self.request_map['spider_info'])
}
response_binary = json.JSONEncoder().encode(response).encode("utf-8")
response_binary_len = len(response_binary)
response_binary_len_binary = struct.pack("!Q", response_binary_len)
response_binary = response_binary_len_binary + response_binary
self.client_socket.sendall(response_binary)
logger.info(
f"[RESPONSE] report free: {response['report_free']}, header size: {response_binary_len}")
finally:
self.client_socket.close()

@ -1,4 +1,5 @@
import threading
import dcs.tests.config
from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions
@ -24,27 +25,41 @@ def translate(word):
return result
def crawl_zhiwang(word, pages_start=2, pages_end=3):
def crawl_zhiwang(word, pages_start=1, pages_end=2):
edge_options = EdgeOptions()
edge_options.use_chromium = True
edge_options.add_argument('headless')
driver = Edge(options=edge_options, executable_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe')
soup = driver_open(driver, word) # 搜索word
papers = [] # 用于保存爬取到的论文
spider(driver, soup, papers)
# 爬取第一篇
if pages_start == 1:
spider(driver, soup, papers)
pages_start += 1
for pn in range(pages_start, pages_end):
content = change_page(driver, pn)
spider(driver, content, papers)
driver.close()
# TODO 写入数据库
class Spider(threading.Thread):
def __init__(self, word: str):
def __init__(self, word: str, pages_start=1, pages_end=1):
super().__init__()
self.word = word
self.daemon = True
self.pages_start = pages_start
self.pages_end = pages_end
pass
def distribute_spiders(self):
free_spiders = dcs.tests.config.get_free_spiders()
for sp in free_spiders:
pass
print(self.pages_start, sp)
# TODO 发布任务
def run(self) -> None:
logger.info('crawling...')
crawl_zhiwang(word=self.word)
self.distribute_spiders()
crawl_zhiwang(word=self.word, pages_start=self.pages_start, pages_end=self.pages_end)

Loading…
Cancel
Save