forked from p3t2ja9zs/dcs
parent
b681c1b92d
commit
2f4fa14b2b
@@ -0,0 +1,136 @@
import json
import socket
import struct
import threading
from configparser import ConfigParser
from json import JSONDecoder

from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions

from dcs.tests.zhiwang import *
from dcs.tools import message_process as mp
from dcs.tools.message_process import parse_request, generate_response


def crawl_zhiwang(word, pages_start, pages_end):
    edge_options = EdgeOptions()
    edge_options.use_chromium = True
    # Disable image and stylesheet loading to speed up page rendering.
    No_Image_loading = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
    edge_options.add_experimental_option("prefs", No_Image_loading)
    edge_options.add_argument('--headless')
    # Read the Edge WebDriver path from the project's settings file.
    configFile = '../../conf/settings.ini'
    con = ConfigParser()
    con.read(configFile, encoding='utf-8')
    items = con.items('crawler')
    items = dict(items)['edge_driver_path']
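    # The 'crawler' section of ../../conf/settings.ini is assumed to look roughly
    # like the sketch below; the driver path is illustrative, not taken from the repo:
    #
    #     [crawler]
    #     edge_driver_path = C:\tools\msedgedriver.exe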
    print(items)
    driver = Edge(options=edge_options, executable_path=items)

    soup = driver_open(driver, word)
    papers = []  # stores the crawled papers
    paper_id = 0
    res = {}  # stores this client's crawl results

    # crawl the first results page
    if pages_start == 1:
        spider(driver, soup, papers)
        logger.debug(res)
        pages_start += 1
        while paper_id < len(papers):
            write2res(papers[paper_id], res)
            paper_id += 1

    # crawl the remaining pages
    while pages_start < pages_end:
        content = change_page(driver, pages_start)
        spider(driver, content, papers)
        while paper_id < len(papers):
            write2res(papers[paper_id], res)
            paper_id += 1
        pages_start += 1
    driver.close()
    # logger.debug("here")
    return res


def write2res(paper: Paper, res):
    # Append one row per named author of the paper, keyed by insertion order.
    for author in paper.authors:
        if author.name:
            res.update(
                {len(res): {'name': author.name, 'college': author.college, 'major': author.major,
                            'title': paper.title}})
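
# A single entry written by write2res, with illustrative placeholder values
# (the real values come from the zhiwang page parser):
#     res[0] == {'name': '...', 'college': '...', 'major': '...', 'title': '...'}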


class Crawl(threading.Thread):
    def __init__(self):
        super(Crawl, self).__init__()
        # Listening socket for help requests forwarded by other clients.
        self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)
        self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.server_socket.bind(('', local_port))

    @staticmethod
    def crawl(request_map) -> dict:
        result_map = crawl_zhiwang(request_map['word'], request_map['pages_start'], request_map['pages_end'])
        # result_map = {0: {'name': 'remote', 'college': 'remote', 'major': 'remote', 'title': 'remote'},
        #               1: {'name': 'remote1', 'college': 'remote1', 'major': 'remote', 'title': 'remote'}}
        logger.debug(result_map)
        return result_map

    def run(self) -> None:
        self.server_socket.listen()
        while True:
            client_socket, _ = self.server_socket.accept()
            request_map = parse_request(client_socket)
            if request_map['type'] == 'request':
                print("receiving help request:\n" + json.dumps(request_map, ensure_ascii=False))
                response_map = self.crawl(request_map)
                response_map.update({'cookie': request_map['cookie']})
                client_socket.sendall(generate_response(response_map))

                # Report back to the central server that this node is free again.
                report_map = {'action': 'report_free', 'cookie': cookie}
                logger.debug(send_request(socket_to_server, report_map))
            if request_map['type'] == 'response':
                print("receiving response:\n" + json.dumps(request_map, ensure_ascii=False))
                # break
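
# Shape of the help request handled in run() above, inferred from the fields it
# reads and from the commented-out local test near the bottom of this file
# (values are illustrative, not actual protocol constants):
#     {'type': 'request', 'action': 'crawl zhiwang', 'word': 'science',
#      'pages_start': 1, 'pages_end': 3, 'cookie': '...'}
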

def send_request(socket2server, req):
    # Send one framed JSON request and block until the server's reply arrives.
    socket2server.sendall(mp.generate_request(req))
    # The reply is framed as an 8-byte big-endian length prefix followed by a UTF-8 JSON body.
    responseJson = JSONDecoder().decode(
        mp.read_bytes(socket2server, struct.unpack('!Q', socket2server.recv(8))[0]).decode(
            "utf-8"))
    return responseJson
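
# mp.generate_request is assumed to produce the same framing that this function
# reads back: an 8-byte big-endian unsigned length (struct format '!Q') followed
# by the UTF-8 encoded JSON body. A minimal sketch of that framing, hypothetical
# and not the actual dcs.tools.message_process implementation:
#
#     def _frame(obj):
#         body = json.dumps(obj, ensure_ascii=False).encode('utf-8')
#         return struct.pack('!Q', len(body)) + body
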

server_ip = '127.0.0.1'
server_port = 7777
local_port = 9999
crawl = Crawl()
crawl.start()
# res = crawl.crawl({'action': 'crawl zhiwang', 'word': 'science', 'pages_start': 1, 'pages_end': 2, 'cookie': '123'})
# logger.debug(res)

# Outbound connection to the central server, bound to the same local port the
# Crawl thread listens on.
socket_to_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)
socket_to_server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
socket_to_server.bind(('', local_port))
socket_to_server.connect((server_ip, server_port))

request = {'action': 'register', 'user': 'liuxiaoyu', 'password': '113818'}
logger.debug(send_request(socket_to_server, request))

request = {'action': 'login', 'user': 'liuxiaoyu', 'password': '113818'}
response = send_request(socket_to_server, request)
logger.debug(response)
# The login response is expected to carry the session cookie used by later requests.
cookie = response['cookie']

request = {'action': 'report_free', 'cookie': cookie}
logger.debug(send_request(socket_to_server, request))

# request = {'action': 'crawl zhiwang', 'word': 'science', 'pages_start': 1, 'pages_end': 3,
#            'cookie': cookie}
# logger.debug(send_request(socket_to_server, request))

# The 'end' action is fire-and-forget: the request is framed and sent, but no
# reply is read back.
request = {'action': 'end'}
socket_to_server.sendall(mp.generate_request(request))

crawl.join()
File diff suppressed because one or more lines are too long
@@ -1 +1 @@
-python .\connect.py --ip 127.0.0.1 --port 7777 login --user yuu --password yuu
+python .\connect.py --ip 127.0.0.1 --port 7777 login --user liuxiaoyu --password 113818