|
|
|
@ -1,23 +1,71 @@
|
|
|
|
|
import json
|
|
|
|
|
import socket
|
|
|
|
|
import struct
|
|
|
|
|
from json import JSONDecoder
|
|
|
|
|
import threading
|
|
|
|
|
from json import JSONDecoder
|
|
|
|
|
from time import sleep
|
|
|
|
|
|
|
|
|
|
from loguru import logger
|
|
|
|
|
from msedge.selenium_tools import Edge
|
|
|
|
|
from msedge.selenium_tools import EdgeOptions
|
|
|
|
|
|
|
|
|
|
from dcs.tests.zhiwang import *
|
|
|
|
|
from dcs.tools import message_process as mp
|
|
|
|
|
from dcs.tools.message_process import parse_request, generate_response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def crawl_zhiwang(word, pages_start, pages_end,
                  executable_path=r'G:\course\yykf\dcs\bin\msedgedriver.exe'):
    """Crawl search-result pages for *word* and return one row per named author.

    Args:
        word: Search keyword handed to ``driver_open``.
        pages_start: First result page to crawl. Page 1 is taken from the
            soup returned by ``driver_open``; every later page is fetched
            with ``change_page``. If this is greater than 1, page 1 is
            opened but not scraped.
        pages_end: Exclusive upper bound. ``pages_start=1, pages_end=3``
            crawls pages 1 and 2.
            NOTE(review): confirm callers really expect an exclusive end.
        executable_path: Location of the msedgedriver binary. Defaults to
            the path that used to be hard-coded here.

    Returns:
        dict: rows keyed by consecutive integers, each row being the
        ``{'name', 'college', 'major', 'title'}`` dict built by
        ``write2res``.
    """
    edge_options = EdgeOptions()
    edge_options.use_chromium = True
    # edge_options.add_argument('--headless')
    driver = Edge(options=edge_options, executable_path=executable_path)

    papers = []  # every paper scraped so far, across all pages
    res = {}  # flattened author rows handed back to the caller
    written = 0  # how many entries of `papers` are already copied into `res`
    page = pages_start  # local cursor, so the argument itself is not mutated

    try:
        soup = driver_open(driver, word)

        # First page: scraped straight from what driver_open returned.
        if page == 1:
            spider(driver, soup, papers)
            for paper in papers[written:]:
                write2res(paper, res)
            written = len(papers)
            logger.debug(res)
            page += 1

        while page < pages_end:
            content = change_page(driver, page)
            spider(driver, content, papers)
            # Only papers appended by this spider() call are new.
            for paper in papers[written:]:
                write2res(paper, res)
            written = len(papers)
            page += 1
    finally:
        # Always end the browser session, including when scraping raised.
        driver.quit()

    logger.debug("here")
    return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def write2res(paper: Paper, res):
    """Append one row per named author of *paper* to *res*, in place.

    Authors whose ``name`` is falsy are skipped. Each row is stored under
    the key ``len(res)`` at the moment of insertion and carries the
    author's name, college and major together with the paper's title.
    Nothing is returned; *res* is mutated.
    """
    named_authors = (person for person in paper.authors if person.name)
    for person in named_authors:
        row = {
            'name': person.name,
            'college': person.college,
            'major': person.major,
            'title': paper.title,
        }
        res[len(res)] = row
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Crawl(threading.Thread):
|
|
|
|
|
def __init__(self):
    """Create the thread's TCP socket and bind it to port 9999.

    The socket is bound on all interfaces (empty host string). Only one
    bind is performed: binding an already-bound socket raises OSError,
    so the earlier bind to port 9000 is dropped in favour of the later
    one.

    Raises:
        OSError: if the address cannot be bound. The socket is closed
            before the error propagates.
    """
    super(Crawl, self).__init__()
    self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)
    # Must be set before bind() to take effect for this address.
    self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    try:
        self.server_socket.bind(('', 9999))
    except OSError:
        # Do not leak the descriptor when the port is unavailable.
        self.server_socket.close()
        raise
|
|
|
|
|
|
|
|
|
|
@staticmethod
def crawl(request_map) -> dict:
    """Run the zhiwang crawl described by *request_map* and return its rows.

    Args:
        request_map: Mapping that provides ``'word'``, ``'pages_start'``
            and ``'pages_end'``; the three values are forwarded to
            ``crawl_zhiwang`` unchanged.

    Returns:
        dict: whatever ``crawl_zhiwang`` returns for that request.

    Raises:
        KeyError: if any of the three keys is missing from *request_map*.
    """
    result_map = crawl_zhiwang(
        request_map['word'],
        request_map['pages_start'],
        request_map['pages_end'],
    )
    logger.debug(result_map)
    return result_map
|
|
|
|
|
|
|
|
|
|
def run(self) -> None:
|
|
|
|
@ -26,8 +74,8 @@ class Crawl(threading.Thread):
|
|
|
|
|
client_socket, _ = self.server_socket.accept()
|
|
|
|
|
request_map = parse_request(client_socket)
|
|
|
|
|
if request_map['type'] == 'request':
|
|
|
|
|
print("receiving help request:\n"+json.dumps(request_map, ensure_ascii=False))
|
|
|
|
|
response_map = self.crawl()
|
|
|
|
|
print("receiving help request:\n" + json.dumps(request_map, ensure_ascii=False))
|
|
|
|
|
response_map = self.crawl(request_map)
|
|
|
|
|
response_map.update({'cookie': request_map['cookie']})
|
|
|
|
|
client_socket.sendall(generate_response(response_map))
|
|
|
|
|
if request_map['type'] == 'response':
|
|
|
|
@ -37,42 +85,41 @@ class Crawl(threading.Thread):
|
|
|
|
|
|
|
|
|
|
crawl = Crawl()
|
|
|
|
|
crawl.start()
|
|
|
|
|
# crawl.join()
|
|
|
|
|
|
|
|
|
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
|
|
|
|
|
socket_to_server.bind(('127.0.0.1', 9000))
|
|
|
|
|
socket_to_server.bind(('127.0.0.1', 9999))
|
|
|
|
|
socket_to_server.connect(('127.0.0.1', 7777))
|
|
|
|
|
|
|
|
|
|
request = {'action': 'register', 'user': 'wufayuan', 'password': '113818'}
|
|
|
|
|
# request = {'action': 'register', 'user': 'wufayuan', 'password': '113818'}
|
|
|
|
|
# socket_to_server.sendall(mp.generate_request(request))
|
|
|
|
|
# responseJson = JSONDecoder().decode(
|
|
|
|
|
# mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
|
|
|
|
|
# "utf-8"))
|
|
|
|
|
# print(responseJson)
|
|
|
|
|
|
|
|
|
|
request = {'action': 'login', 'user': 'wufayuan', 'password': '113818'}
|
|
|
|
|
socket_to_server.sendall(mp.generate_request(request))
|
|
|
|
|
responseJson = JSONDecoder().decode(
|
|
|
|
|
mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
|
|
|
|
|
"utf-8"))
|
|
|
|
|
cookie = responseJson['cookie']
|
|
|
|
|
print(responseJson)
|
|
|
|
|
|
|
|
|
|
request = {'action': 'login', 'user': 'wufayuan', 'password': '113818'}
|
|
|
|
|
request = {'action': 'report_free', 'cookie': cookie}
|
|
|
|
|
socket_to_server.sendall(mp.generate_request(request))
|
|
|
|
|
responseJson = JSONDecoder().decode(
|
|
|
|
|
mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
|
|
|
|
|
"utf-8"))
|
|
|
|
|
cookie = responseJson['cookie']
|
|
|
|
|
print(responseJson)
|
|
|
|
|
|
|
|
|
|
request = {'action': 'report_free', 'cookie': cookie}
|
|
|
|
|
request = {'action': 'crawl zhiwang', 'word': 'science', 'pages_start': 1, 'pages_end': 3,
|
|
|
|
|
'cookie': cookie}
|
|
|
|
|
socket_to_server.sendall(mp.generate_request(request))
|
|
|
|
|
responseJson = JSONDecoder().decode(
|
|
|
|
|
mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
|
|
|
|
|
"utf-8"))
|
|
|
|
|
print(responseJson)
|
|
|
|
|
|
|
|
|
|
# request = {'action': 'crawl zhiwang', 'word': 'computer', 'pages_start': 1, 'pages_end': 10,
|
|
|
|
|
# 'cookie': cookie}
|
|
|
|
|
# socket_to_server.sendall(mp.generate_request(request))
|
|
|
|
|
# responseJson = JSONDecoder().decode(
|
|
|
|
|
# mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
|
|
|
|
|
# "utf-8"))
|
|
|
|
|
# print(responseJson)
|
|
|
|
|
|
|
|
|
|
request = {'action': 'end'}
|
|
|
|
|
socket_to_server.sendall(mp.generate_request(request))
|
|
|
|
|
|
|
|
|
|