forked from p3t2ja9zs/dcs
Refactored the whole project so that it better fits a model of multiple cooperating subsystems, and implemented the crawler task system, including task distribution and the composition of remote and local results. The subsystems poll for work and spawn a dedicated worker thread as soon as a task is accepted, so the system can now genuinely serve multiple users concurrently. Also improved the cookie mechanism (user authentication and identification), improved extracting and merging data from the database, improved the coordination between subsystems, and produced an initial version of the client code.
parent
a1a73aa412
commit
b1a90b646c
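Every message exchanged in this commit, client-to-server requests as well as server-to-client responses, uses the same wire format: an 8-byte big-endian length (struct format '!Q') followed by a UTF-8 encoded JSON object; see generate_request, generate_response, parse_request and read_bytes in the hunks below. A minimal, self-contained sketch of that framing (send_frame, recv_frame and recv_exactly are illustrative names, not functions from this repository):

import json
import socket
import struct


def recv_exactly(sock: socket.socket, size: int) -> bytes:
    # keep calling recv() until exactly `size` bytes arrive or the peer closes
    data = b''
    while len(data) < size:
        chunk = sock.recv(size - len(data))
        if not chunk:
            break
        data += chunk
    return data


def send_frame(sock: socket.socket, payload: dict) -> None:
    # encode the dict as UTF-8 JSON and prefix it with an 8-byte big-endian length
    body = json.dumps(payload).encode('utf-8')
    sock.sendall(struct.pack('!Q', len(body)) + body)


def recv_frame(sock: socket.socket) -> dict:
    # read the 8-byte length header, then exactly that many body bytes
    size = struct.unpack('!Q', recv_exactly(sock, 8))[0]
    return json.loads(recv_exactly(sock, size).decode('utf-8'))


if __name__ == '__main__':
    a, b = socket.socketpair()
    send_frame(a, {'action': 'report_free', 'cookie': 'example'})
    print(recv_frame(b))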
@@ -1,2 +1,2 @@
!/dcs/tests/zhiwang.py
!/dcs/tests/cookie.py
!/dcs/tools/cookie.py
@@ -1,5 +1,5 @@
[server]
port = 7777
daemon = True

buffer_size = 8 * 1024 * 1024
@@ -0,0 +1,79 @@
import json
import socket
import struct
from json import JSONDecoder
import threading

from dcs.tools import message_process as mp
from dcs.tools.message_process import parse_request, generate_response


class Crawl(threading.Thread):
    def __init__(self):
        super(Crawl, self).__init__()
        self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)
        self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.server_socket.bind(('', 9000))

    @staticmethod
    def crawl() -> dict:
        result_map = {0: {'name': 'remote', 'college': 'remote', 'major': 'remote', 'paper': 'remote'}}
        return result_map

    def run(self) -> None:
        self.server_socket.listen()
        while True:
            client_socket, _ = self.server_socket.accept()
            request_map = parse_request(client_socket)
            if request_map['type'] == 'request':
                print("receiving help request:\n"+json.dumps(request_map, ensure_ascii=False))
                response_map = self.crawl()
                response_map.update({'cookie': request_map['cookie']})
                client_socket.sendall(generate_response(response_map))
            if request_map['type'] == 'response':
                print("receiving response:\n" + json.dumps(request_map, ensure_ascii=False))
                # break


crawl = Crawl()
crawl.start()
# crawl.join()

with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
    socket_to_server.bind(('127.0.0.1', 9000))
    socket_to_server.connect(('127.0.0.1', 7777))

    request = {'action': 'register', 'user': 'wufayuan', 'password': '113818'}
    socket_to_server.sendall(mp.generate_request(request))
    responseJson = JSONDecoder().decode(
        mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
            "utf-8"))
    print(responseJson)

    request = {'action': 'login', 'user': 'wufayuan', 'password': '113818'}
    socket_to_server.sendall(mp.generate_request(request))
    responseJson = JSONDecoder().decode(
        mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
            "utf-8"))
    cookie = responseJson['cookie']
    print(responseJson)

    request = {'action': 'report_free', 'cookie': cookie}
    socket_to_server.sendall(mp.generate_request(request))
    responseJson = JSONDecoder().decode(
        mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
            "utf-8"))
    print(responseJson)

    request = {'action': 'crawl zhiwang', 'word': 'computer', 'pages_start': 1, 'pages_end': 10,
               'cookie': cookie}
    socket_to_server.sendall(mp.generate_request(request))
    responseJson = JSONDecoder().decode(
        mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
            "utf-8"))
    print(responseJson)

    request = {'action': 'end'}
    socket_to_server.sendall(mp.generate_request(request))

crawl.join()
@@ -0,0 +1,35 @@
import threading
import socket

from loguru import logger

from dcs.tools.message_process import generate_response, generate_request


class Communicator(threading.Thread):
    def __init__(self):
        super(Communicator, self).__init__()
        self.responser_list: list[tuple[str, socket.socket, dict]] = []
        self.info_list: list[tuple[tuple, dict]] = []

    def add_response(self, response_type: str, client_socket: socket.socket, response_map: dict):
        self.responser_list.append((response_type, client_socket, response_map))

    def add_info(self, info_type: str, address: tuple, info_map: dict):
        info_map.update({'type': info_type})
        self.info_list.append((address, info_map))

    def run(self) -> None:
        while True:
            for responser in self.responser_list:
                response_type, client_socket, response_map = responser[0], responser[1], responser[2]
                logger.info(f'sending response to {client_socket.getpeername()}: {response_map}')
                client_socket.sendall(generate_response(response_map))
                self.responser_list.remove(responser)

            with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_client:
                for info in self.info_list:
                    socket_to_client.connect(info[0])
                    logger.info(f'sending response to {info[0]}: {info[1]}')
                    socket_to_client.sendall(generate_request(info[1]))
                    self.info_list.remove(info)
File diff suppressed because it is too large
@@ -1,22 +1,52 @@
# -*- coding: UTF-8 -*-
from dcs.tests.server import Server
from dcs.server import Server
from configparser import ConfigParser
from loguru import logger
from dcs.tests.database import create_user_info
from dcs.tools.database import create_user_info
from dcs.requester import Requester
from dcs.spider import Spider
from conf.config import global_var
from dcs.user_process import UP
from dcs.communicate import Communicator

create_user_info()
logger.add('./dcs.log', rotation='10 MB', enqueue=True, backtrace=True, diagnose=True)
logger.info('reading config args...')
logger.debug('reading config args...')
configFile = '../conf/settings.ini'
con = ConfigParser()
con.read(configFile, encoding='utf-8')
items = con.items('server')
items = dict(items)

logger.info('starting the server...')
server = Server(int(items['port']))
server.daemon = items['daemon']
server.start()
server.join()
logger.debug('starting the main server...')
global_var.server = Server(int(items['port']), eval(items['buffer_size']))
global_var.server.daemon = items['daemon']
global_var.server.start()

logger.debug('starting the requester server...')
global_var.requester = Requester()
global_var.requester.daemon = True
global_var.requester.start()

logger.debug('starting the spider server...')
global_var.spider = Spider()
global_var.spider.daemon = True
global_var.spider.start()

logger.debug('starting the user server...')
global_var.up = UP()
global_var.up.daemon = True
global_var.up.start()

logger.debug('starting the communicator server...')
global_var.communicator = Communicator()
global_var.communicator.daemon = True
global_var.communicator.start()

global_var.server.join()
global_var.requester.join()
global_var.spider.join()
global_var.up.join()
global_var.communicator.join()

logger.warning('Overing...')
@@ -0,0 +1,92 @@
import socket
import struct
import threading
from collections import deque
from json import JSONEncoder, JSONDecoder
from time import sleep

from loguru import logger

from conf.config import set_crawl_result
from dcs.tests.spider_task import Spider_partial_task


def read_bytes(s: 'socket.socket', size: 'int') -> 'bytes':
    data = ''.encode('utf-8')
    while len(data) < size:
        rsp_data = s.recv(size - len(data))
        data += rsp_data
        if len(rsp_data) == 0:
            break
    return data


def generate_request(request) -> 'bytes':
    request_bytes = JSONEncoder().encode(request).encode("utf-8")
    return struct.pack("!Q", len(request_bytes)) + request_bytes


class Requester(threading.Thread):
    def __init__(self):
        super().__init__()
        self.daemon = True
        self.reqs = []
        pass

    def is_remote_task_complete(self, client_address, request_map):
        pass
        # for req in self.reqs:
        #     if req[0].client_address == client_address and req[0].request_map == request_map:
        #         print(req[1])
        #         return req[1]

    def get(self, client_address, task: Spider_partial_task):
        logger.info(f'sending crawl request to {str(client_address)}')
        req = Req(client_address, task)
        self.reqs.append(req)
        req.start()


class Req(threading.Thread):
    def __init__(self, client_address, task: Spider_partial_task):
        super(Req, self).__init__()
        self.client_address = client_address
        self.task: Spider_partial_task = task
        self.request_map = task.request_map
        self.responseJson = None

    def run(self) -> None:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_client:
            socket_to_client.connect(self.client_address)
            self.request_map.update({'type': 'request'})
            socket_to_client.sendall(generate_request(self.request_map))
            self.responseJson = JSONDecoder().decode(
                read_bytes(socket_to_client, struct.unpack('!Q', socket_to_client.recv(8))[0]).decode(
                    "utf-8"))
            # res.append(self.responseJson)
            # sleep(10)
            cookie = self.responseJson['cookie']
            del self.responseJson['cookie']
            logger.debug('receiving remote task result, saving...')
            set_crawl_result(cookie, self.responseJson)
            self.task.pages_start = self.task.pages_end  # finished
            self.task.thread = None
            logger.debug("result: "+str(self.responseJson))
            # global_var.requester.set_req_state(self.client_address, self.request_map, True)


if __name__ == '__main__':
    address = ('127.0.0.1', 7798)
    address1 = ('127.0.0.1', 7799)
    my_request = {'request': 'I am asking you...'}
    my_request1 = {'request1': 'I am asking you...'}
    res = deque()
    requester = Requester()
    requester.start()

    # requester.get(address, my_request)
    # requester.get(address1, my_request1)
    sleep(2)
    # print(requester1.get())
    # print(requester.get())
    print(res)
@@ -0,0 +1,44 @@
import socket
import threading
from dcs.tests.spider_task import Spider_task


class Spider(threading.Thread):
    def __init__(self):
        super(Spider, self).__init__()
        # self.tasks: list[tuple[socket.socket, dict]] = []
        self.tasks: list[Spider_task] = []
        self.daemon = True

    def add_task(self, request_map: dict, client_socket: socket.socket):
        # distribute tasks, 5 pages as a task
        # [pages_start, pages_end), like [1,3) means 1,2 page
        # pages_start = request_map['pages_start']
        # pages_end = request_map['pages_end']
        # while pages_start < pages_end:
        #     tmp = request_map.copy()
        #     tmp['pages_start'] = pages_start
        #     if pages_start + const_page <= pages_end:
        #         pages_start += const_page
        #     else:
        #         pages_start = pages_end
        #     tmp['pages_end'] = pages_start
        #     self.tasks.append((client_socket, tmp))
        self.tasks.append(Spider_task(client_socket, request_map))

    def run(self) -> None:
        while True:
            # free_remote_nodes = get_free_sockets()
            # for task in self.tasks:
            #     logger.info(f'processing crawl task...')
            #     for f_node in free_remote_nodes:
            #         address = f_node.getpeername()
            #         global_var.requester.get(address, task[1])
            #     global_var.responser.add_response('crawling state', task[0], {'crawling state': 'starting, please wait...'})
            #     spider_task = Spider_task(task[0], task[1])
            #     spider_task.start()
            #     self.tasks.remove(task)
            for task in self.tasks:
                task.start()
                self.tasks.remove(task)
            pass
@@ -1,43 +0,0 @@
import os
import requests
from bs4 import BeautifulSoup
# crawler header data
cookies = {
    'SINAGLOBAL': '6797875236621.702.1603159218040',
    'SUB': '_2AkMXbqMSf8NxqwJRmfkTzmnhboh1ygvEieKhMlLJJRMxHRl-yT9jqmg8tRB6PO6N_Rc_2FhPeZF2iThYO9DfkLUGpv4V',
    'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9Wh-nU-QNDs1Fu27p6nmwwiJ',
    '_s_tentry': 'www.baidu.com',
    'UOR': 'www.hfut.edu.cn,widget.weibo.com,www.baidu.com',
    'Apache': '7782025452543.054.1635925669528',
    'ULV': '1635925669554:15:1:1:7782025452543.054.1635925669528:1627316870256',
}
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/25',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
params = (
    ('cate', 'realtimehot'),
)
# data storage
fo = open("./微博热搜.txt",'a',encoding="utf-8")
# fetch the page
response = requests.get('https://s.weibo.com/top/summary', headers=headers, params=params, cookies=cookies)
# parse the page
response.encoding='utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
# selector for the content to crawl
content="#pl_top_realtimehot > table > tbody > tr > td.td-02 > a"
# clean the data
a=soup.select(content)
for i in range(0,len(a)):
    a[i] = a[i].text
    fo.write(a[i]+'\n')
fo.close()
@@ -1,75 +1,32 @@
import socket
import threading
import json
import struct
from loguru import logger
from dcs.tests.spider import Spider
from dcs.tests.user_request_handler import Urh


def generate_response(response):
    response_binary = json.JSONEncoder().encode(response).encode("utf-8")
    response_binary_len = len(response_binary)
    response_binary_len_binary = struct.pack("!Q", response_binary_len)
    response_binary = response_binary_len_binary + response_binary

    return response_binary
from loguru import logger
from dcs.tools.message_process import parse_request, check
from conf.config import global_var


class RequestHandler(threading.Thread):
    def __init__(self, file_server, client_socket: 'socket.socket', request_map: 'dict'):
    def __init__(self, client_socket: 'socket.socket'):
        super().__init__()
        self.file_server = file_server
        self.client_socket = client_socket
        self.request_map = request_map
        self.daemon = True
        pass

    def test(self):
        logger.info(f"[REQUEST] test")
        response = {
            'test': 'hello TEST'
        }
        self.client_socket.sendall(generate_response(response))
        logger.info(f"[RESPONSE] test: {response['test']}")

    def translate(self):
        logger.info(f"[REQUEST] translate")
        spider = Spider(self.request_map['word'])
        response = {
            'translate': spider.run()
        }
        self.client_socket.sendall(generate_response(response))
        logger.info(f"[RESPONSE] translate: {response['translate']}")

    def crawl_zhiwang(self):
        logger.info(f"[REQUEST] crawl zhiwang")
        try:
            pages_start = int(self.request_map['pages_start'])
            pages_end = int(self.request_map['pages_end'])
        except:
            pages_start = 1
            pages_end = 1
        spider = Spider(self.request_map['word'], pages_start=pages_start, pages_end=pages_end)
        spider.run()
        response = {
            'crawl zhiwang': 'success'  # TODO
        }
        self.client_socket.sendall(generate_response(response))
        logger.info(f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}")

    def run(self) -> None:
        try:
            if self.request_map['action'] == 'test':
                self.test()
                self.client_socket.close()
            elif self.request_map['action'] == 'translate':
                self.translate()
            elif self.request_map['action'] == 'crawl zhiwang':
                self.crawl_zhiwang()
                self.client_socket.close()
            elif self.request_map['action'] in ['report_free', 'login', 'register']:
                urh = Urh(self.request_map, self.client_socket)
                urh.start()
        except:
            pass
            while True:
                request_map = parse_request(self.client_socket)

                if request_map['action'] == 'end':
                    logger.info(f"[REQUEST] end")
                    logger.debug(f"communication over from {self.client_socket.getpeername()}!")
                    break
                elif request_map['action'] == 'crawl zhiwang':
                    chk_res = check(request_map['cookie'])
                    if chk_res is None:
                        logger.warning("user info error!")
                        break
                    global_var.spider.add_task(request_map, self.client_socket)
                elif request_map['action'] in ['report_free', 'login', 'register']:
                    global_var.up.add_request(request_map, self.client_socket)
        except Exception as e:
            print(str(e))
@@ -1,72 +0,0 @@
import socket
import threading
import struct
from json import JSONEncoder, JSONDecoder
from time import sleep

from loguru import logger
from collections import deque


def read_bytes(s: 'socket.socket', size: 'int') -> 'bytes':
    data = ''.encode('utf-8')
    while len(data) < size:
        rsp_data = s.recv(size - len(data))
        data += rsp_data
        if len(rsp_data) == 0:
            break
    return data


def generate_request(request) -> 'bytes':
    request_bytes = JSONEncoder().encode(request).encode("utf-8")
    return struct.pack("!Q", len(request_bytes)) + request_bytes


class Requester(threading.Thread):
    def __init__(self, request_map: 'dict', client_address: tuple):
        super().__init__()
        self.request_map = request_map
        self.daemon = True
        self.client_address = client_address
        self.responseJson = None
        pass

    def request(self):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_client:
            socket_to_client.connect(self.client_address)
            socket_to_client.sendall(generate_request(self.request_map))
            self.responseJson = JSONDecoder().decode(
                read_bytes(socket_to_client, struct.unpack('!Q', socket_to_client.recv(8))[0]).decode(
                    "utf-8"))
            res.append(self.responseJson)
            return res

    def get(self):
        try:
            return self.request()
        except:
            return None

    def run(self) -> None:
        self.request()


if __name__ == '__main__':
    address = ('127.0.0.1', 7798)
    address1 = ('127.0.0.1', 7799)
    my_request = {'request': 'I am asking you...'}
    my_request1 = {'request1': 'I am asking you...'}
    res = deque()

    requester = Requester(my_request, address)
    requester1 = Requester(my_request1, address1)
    requester1.start()
    requester.start()

    requester1.join()
    requester.join()
    sleep(2)
    # print(requester1.get())
    # print(requester.get())
    print(res)
@@ -0,0 +1,232 @@
import csv
import socket
import threading
from typing import Optional

from loguru import logger
from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions

from conf.config import global_var, get_free_sockets, get_crawl_result, get_by_cookie, set_state_socket
from dcs.tools.database import get_crawl_result_by_crawl_id, write_result2database
from dcs.tools.database import get_last_crawl_id, create_crawl_result_table
from dcs.tests.zhiwang import *


def translate(word):
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    data = {'i': word,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_REALTIME',
            'typoResult': 'false'}
    r = requests.post(url, data)
    answer = r.json()
    result = answer['translateResult'][0][0]['tgt']
    return result


def write2database(paper: Paper, table_name: str, last_crawl_id: int):
    logger.info(f'writing to database: {paper.title}')
    for author in paper.authors:
        if author.name:
            write_result2database([author.name, author.college, author.major, paper.title], table_name, last_crawl_id)


def write2csv(papers: list, file_name='./paper_author.csv'):
    # write to file
    f_papers_authors = open(file_name, 'w', encoding='utf-8', newline='')
    writer_p_a = csv.writer(f_papers_authors)  # build a csv writer on the file object
    writer_p_a.writerow(["name", "college", "major", "paper"])  # csv header row

    # go through every paper
    for paper in papers:
        # write to the paper_author.csv file
        for author in paper.authors:
            if author.name:
                # print(author + " ")
                writer_p_a.writerow([author.name, author.college, author.major, paper.title])

    # close the file
    f_papers_authors.close()


class Crawler(threading.Thread):
    def __init__(self, partial_task: 'Spider_partial_task'):
        super(Crawler, self).__init__()
        self.partial_task = partial_task

    def crawl_zhiwang(self, user_name=None):
        edge_options = EdgeOptions()
        edge_options.use_chromium = True
        # edge_options.add_argument('headless')
        driver = Edge(options=edge_options, executable_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe')
        soup = driver_open(driver, self.partial_task.word)  # search for the word

        papers = []  # holds the crawled papers
        table_name = f'{user_name}_crawl_result'
        create_crawl_result_table(table_name=table_name)
        last_crawl_id = get_last_crawl_id(table_name=table_name)
        self.partial_task.crawl_id = last_crawl_id + 1
        paper_id = 0

        # crawl the first page
        if self.partial_task.pages_start == 1:
            spider(driver, soup, papers)
            self.partial_task.pages_start += 1
            while paper_id < len(papers):
                write2database(papers[paper_id], table_name=table_name, last_crawl_id=last_crawl_id)
                paper_id += 1

        while self.partial_task.pages_start < self.partial_task.pages_end:
            content = change_page(driver, self.partial_task.pages_start)
            spider(driver, content, papers)
            while paper_id < len(papers):
                write2database(papers[paper_id], table_name=table_name, last_crawl_id=last_crawl_id)
                paper_id += 1
            self.partial_task.pages_start += 1
        driver.close()

    def test_simulation(self, user_name):
        table_name = f'{user_name}_crawl_result'
        create_crawl_result_table(table_name=table_name)
        last_crawl_id = get_last_crawl_id(table_name=table_name)
        self.partial_task.crawl_id = last_crawl_id + 1

        # simulate crawling
        logger.debug('simulation crawling...')
        paper = Paper('test', [Author('test', 'test', 'test')])
        write2database(paper, table_name=table_name, last_crawl_id=last_crawl_id)
        write2database(paper, table_name=table_name, last_crawl_id=last_crawl_id)
        write2database(paper, table_name=table_name, last_crawl_id=last_crawl_id)

        # over
        self.partial_task.pages_start = self.partial_task.pages_end

    def run(self) -> None:
        try:
            # self.crawl_zhiwang(user_name=self.partial_task.cui.user_name)
            self.test_simulation(user_name=self.partial_task.cui.user_name)
        except Exception as e:
            print(e)
        finally:
            logger.info(f'partial crawl task finished: {str(self.partial_task)}')
            self.partial_task.thread = None
            # self.partial_task.pages_start = self.partial_task.pages_end


class Spider_partial_task:
    def __init__(self, full_task: 'Spider_task', request_map: dict):
        self.full_task: Spider_task = full_task
        self.request_map = request_map
        self.thread: 'threading.Thread|None' = None
        self.word = self.request_map['word']
        self.pages_start = self.request_map['pages_start']
        self.pages_end = self.request_map['pages_end']
        self.cui = get_by_cookie(self.request_map['cookie'])
        self.task_type: Optional[str] = None
        self.crawl_id = None

    def is_partial_task_crawl_completely(self):
        return self.pages_start == self.pages_end

    def __str__(self):
        return f'{self.full_task.client_socket.getpeername(), self.request_map}'


class Spider_task(threading.Thread):
    def __init__(self, client_socket: socket.socket, request_map: dict):
        super().__init__()
        self.client_socket = client_socket
        self.request_map = request_map
        self.partial_tasks: list[Spider_partial_task] = []
        self.const_page = 3
        pass

    def distribute_task(self):
        # distribute tasks, 3 pages as a task
        # [pages_start, pages_end), e.g. [1,3) means pages 1 and 2
        logger.info(f'distributing task: {self.client_socket.getpeername(), self.request_map}')
        pages_start = self.request_map['pages_start']
        pages_end = self.request_map['pages_end']
        while pages_start < pages_end:
            tmp = self.request_map.copy()
            tmp['pages_start'] = pages_start
            if pages_start + self.const_page < pages_end:
                pages_start += self.const_page
            else:
                pages_start = pages_end
            tmp['pages_end'] = pages_start
            # self.tasks.append((client_socket, tmp))
            self.partial_tasks.append(Spider_partial_task(self, tmp))
        logger.debug(self.partial_tasks)

    def is_all_task_crawled(self):
        for task in self.partial_tasks:
            # print(task.task_type, task.is_partial_task_crawl_completely())
            if not task.is_partial_task_crawl_completely():
                return False
        return True

    def compose_result(self):
        logger.debug('composing task...')
        result = dict()
        remote_result = get_crawl_result(self.request_map['cookie'])
        for result_map in remote_result:
            result.update(result_map)
        for task in self.partial_tasks:
            # print(task.task_type)
            if task.task_type == 'local':
                local_result = dict()
                local_result_database = get_crawl_result_by_crawl_id(
                    f"{get_by_cookie(task.request_map['cookie']).user_name}_crawl_result",
                    task.crawl_id)
                initial_id = local_result_database[0][0]
                for res in local_result_database:
                    local_result.update({res[0]-initial_id+1: {'name': res[1], 'college': res[2], 'major': res[3], 'paper': res[4]}})
                logger.debug(local_result)
                result.update(local_result)

        global_var.communicator.add_info('response', self.client_socket.getpeername(), result)

    def run(self) -> None:
        global_var.communicator.add_response('crawling state', self.client_socket,
                                             {'crawling state': 'starting, please wait...'})
        self.distribute_task()

        free_remote_nodes = list(get_free_sockets())
        logger.debug(free_remote_nodes)
        while True:  # necessary, otherwise the for loop would only run once
            for task in self.partial_tasks:
                if task.is_partial_task_crawl_completely():
                    continue
                else:
                    current_task_thread = task.thread
                    if current_task_thread is None:
                        for f_node in free_remote_nodes:
                            # print(free_remote_nodes)
                            address = f_node.getpeername()
                            # address = (address[0], address[1] + 1)
                            logger.debug('generating remote task')
                            task.thread = global_var.requester
                            task.task_type = 'remote'
                            global_var.requester.get(address, task)
                            free_remote_nodes.remove(f_node)  # TODO
                            set_state_socket(f_node, 'busy')
                            break
                        else:
                            logger.debug('generating local task')
                            crawler = Crawler(task)
                            task.thread = crawler
                            task.task_type = 'local'
                            crawler.start()
            if self.is_all_task_crawled():
                break

        # sleep(3)
        self.compose_result()
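The distribute_task method above splits the requested page range into partial tasks of at most const_page (3) pages, using half-open [pages_start, pages_end) intervals. A standalone sketch of the same partition rule (split_pages is an illustrative name, not repository code):

def split_pages(pages_start: int, pages_end: int, chunk: int = 3) -> list:
    # split the half-open range [pages_start, pages_end) into sub-ranges of at most `chunk` pages,
    # mirroring the while loop in Spider_task.distribute_task
    parts = []
    while pages_start < pages_end:
        start = pages_start
        pages_start = min(pages_start + chunk, pages_end)
        parts.append((start, pages_start))
    return parts

if __name__ == '__main__':
    # [1, 10) -> [(1, 4), (4, 7), (7, 10)], i.e. pages 1-3, 4-6 and 7-9
    print(split_pages(1, 10))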
@@ -0,0 +1,48 @@
import socket
import json
import struct
from json import JSONEncoder
from conf.config import exists


def parse_request(client_socket: socket.socket):
    request_header_size = struct.unpack("!Q", read_bytes(client_socket, 8))[0]
    request_map = json.JSONDecoder().decode(read_bytes(client_socket, request_header_size).decode("utf-8"))
    return request_map


def generate_response(response):
    response_binary = json.JSONEncoder().encode(response).encode("utf-8")
    response_binary_len = len(response_binary)
    response_binary_len_binary = struct.pack("!Q", response_binary_len)
    response_binary = response_binary_len_binary + response_binary

    return response_binary


def read_bytes(s: 'socket.socket', size: 'int') -> 'bytes':
    data = ''.encode('utf-8')
    while len(data) < size:
        rsp_data = s.recv(size - len(data))
        data += rsp_data
        if len(rsp_data) == 0:
            break
    return data


def check(cookie: str):
    if exists(cookie):
        return cookie
    return None  # TODO: if user error, return None, else return cookie
    pass


def generate_request(request) -> 'bytes':
    """
    Build a request from the given dict.
    The request consists of an 8-byte header length followed by the header data.
    :param request: dict
    :return: bytes, the request data
    """
    request_bytes = JSONEncoder().encode(request).encode("utf-8")
    return struct.pack("!Q", len(request_bytes)) + request_bytes
@@ -0,0 +1,22 @@
import socket
import threading
from dcs.tests.user_request_handler import Urh
from loguru import logger


class UP(threading.Thread):
    def __init__(self):
        super(UP, self).__init__()
        self.requests: list[tuple[socket.socket, dict]] = []

    def add_request(self, request_map: dict, client_socket: socket.socket):
        self.requests.append((client_socket, request_map))

    def run(self) -> None:
        while True:
            for request in self.requests:
                logger.info(f'processing user request...')
                urh = Urh(request[1], request[0])
                urh.start()
                self.requests.remove(request)
            pass