梳理了项目结构

master
wufayuan 3 years ago
parent e69f4ea071
commit 64a607e50b

@ -457,3 +457,79 @@
2022-03-31 21:16:06.700 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:16:08.706 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-03-31 21:16:08.707 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a
2022-03-31 21:56:03.819 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:56:03.820 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:56:06.132 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-03-31 21:56:06.132 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a
2022-03-31 21:56:14.117 | INFO | dcs.tests.requestHandler:crawl_zhiwang:46 - [REQUEST] crawl zhiwang
2022-03-31 21:56:14.117 | INFO | dcs.tests.spider:run:104 - crawling...
2022-03-31 21:59:43.973 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:59:43.974 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:59:47.613 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-03-31 21:59:47.643 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: None
2022-03-31 22:00:10.219 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 22:00:10.220 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 22:00:13.984 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-03-31 22:00:14.029 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 6b95f1badcff627f56e1d1b2b436a66e9d668179
2022-03-31 22:00:22.738 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-03-31 22:00:22.779 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 2a17833fb66e33e37c5b893f4d11054ba660f39a
2022-03-31 22:00:25.994 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-03-31 22:00:26.023 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 012f698322e6cd67d75c8bc91f6a9058f9262337
2022-04-01 08:15:26.267 | INFO | __main__:<module>:9 - reading config args...
2022-04-01 08:15:26.268 | INFO | __main__:<module>:16 - starting the server...
2022-04-01 08:15:39.570 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-04-01 08:15:39.594 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 7afecad3195f6b790418127316607238198b8193
2022-04-01 08:16:31.391 | INFO | __main__:<module>:9 - reading config args...
2022-04-01 08:16:31.394 | INFO | __main__:<module>:16 - starting the server...
2022-04-01 08:16:35.093 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-04-01 08:16:35.100 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 23eb3a72fb6e9eac029ed47356e4eff0e394768f
2022-04-01 08:17:07.484 | INFO | __main__:<module>:9 - reading config args...
2022-04-01 08:17:07.487 | INFO | __main__:<module>:16 - starting the server...
2022-04-01 08:17:26.291 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-04-01 08:19:48.255 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 9825b67537b1e2a589ff2570f033d74f33fbaf84
2022-04-01 08:29:59.684 | INFO | __main__:<module>:9 - reading config args...
2022-04-01 08:29:59.685 | INFO | __main__:<module>:16 - starting the server...
2022-04-01 08:30:03.578 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-04-01 08:30:03.578 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a
2022-04-01 08:30:03.579 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-04-01 08:30:03.618 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 93f0363797d703e85fd8b7f216b2cc3cd33e234d
2022-04-01 08:30:03.618 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-04-01 08:30:03.619 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a
2022-04-01 08:30:03.619 | INFO | dcs.tests.server:run:36 - [REQUEST] end
2022-04-01 08:30:03.620 | WARNING | dcs.tests.server:run:37 - communication over!
2022-04-01 08:30:03.620 | WARNING | __main__:<module>:22 - Overing...
2022-04-01 08:41:16.250 | INFO | __main__:<module>:9 - reading config args...
2022-04-01 08:41:16.251 | INFO | __main__:<module>:16 - starting the server...
2022-04-01 08:41:19.564 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-04-01 08:41:19.578 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 40456734e87596bb849caa4554a2283a27b6e871
2022-04-01 08:41:19.579 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-04-01 08:41:19.580 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 40456734e87596bb849caa4554a2283a27b6e871
2022-04-01 08:41:19.581 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-04-01 08:41:19.581 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 40456734e87596bb849caa4554a2283a27b6e871
2022-04-01 08:41:19.581 | INFO | dcs.tests.server:run:36 - [REQUEST] end
2022-04-01 08:41:19.582 | WARNING | dcs.tests.server:run:37 - communication over!
2022-04-01 08:41:19.582 | WARNING | __main__:<module>:22 - Overing...
2022-04-01 08:42:10.111 | INFO | __main__:<module>:9 - reading config args...
2022-04-01 08:42:10.112 | INFO | __main__:<module>:16 - starting the server...
2022-04-01 08:42:12.631 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-04-01 08:42:12.659 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 3852e55acf1b5641e3d67991d5d104e8e82d66c2
2022-04-01 08:42:12.660 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-04-01 08:42:12.660 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 3852e55acf1b5641e3d67991d5d104e8e82d66c2
2022-04-01 08:42:24.187 | INFO | dcs.tests.requestHandler:crawl_zhiwang:46 - [REQUEST] crawl zhiwang
2022-04-01 08:42:24.188 | INFO | dcs.tests.spider:run:154 - crawling...
2022-04-01 08:46:30.210 | INFO | __main__:<module>:9 - reading config args...
2022-04-01 08:46:30.210 | INFO | __main__:<module>:16 - starting the server...
2022-04-01 08:46:32.912 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-04-01 08:46:32.941 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 91c17091bf6ff1855e24152606b9ee73fee41059
2022-04-01 08:46:32.942 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-04-01 08:46:32.943 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 91c17091bf6ff1855e24152606b9ee73fee41059
2022-04-01 08:46:42.426 | INFO | dcs.tests.requestHandler:crawl_zhiwang:46 - [REQUEST] crawl zhiwang
2022-04-01 08:46:42.426 | INFO | dcs.tests.spider:run:155 - crawling...
2022-04-08 20:27:02.331 | INFO | __main__:<module>:9 - reading config args...
2022-04-08 20:27:02.333 | INFO | __main__:<module>:16 - starting the server...
2022-04-08 20:27:13.717 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login
2022-04-08 20:27:13.753 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 5bc37fbdcb59701ad127b0a76920326a77a882ba
2022-04-08 20:27:13.754 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-04-08 20:27:13.754 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 5bc37fbdcb59701ad127b0a76920326a77a882ba
2022-04-08 20:27:20.484 | INFO | dcs.tests.requestHandler:crawl_zhiwang:46 - [REQUEST] crawl zhiwang
2022-04-08 20:27:20.484 | INFO | dcs.tests.spider:run:156 - crawling...

@ -4,6 +4,7 @@ from threading import Thread
import socket
from json import JSONEncoder, JSONDecoder
import sys
from dcs.tests.server import Server
# -------------------------------配置--------------------------------------------
# ------------------------------config--------------------------------------------
@ -43,14 +44,15 @@ def generate_request(request) -> 'bytes':
class Client(Thread):
def __init__(self, ip: str, port: int) -> None:
def __init__(self, server_ip: str, server_port: int) -> None:
"""
:param ip: 服务器IP
:param port: 服务器端口
:param server_ip: 服务器IP
:param server_port: 服务器端口
"""
super().__init__()
self.ip = ip
self.port = port
self.ip = server_ip
self.port = server_port
self.cookie = None
def test(self):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
@ -68,24 +70,6 @@ class Client(Thread):
return responseJson['test']
def translate(self, word: str):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.connect((self.ip, self.port))
request = dict()
request['action'] = 'translate'
request['word'] = word
request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a'
full_request = generate_request(request)
socket_to_server.sendall(full_request)
responseJson = JSONDecoder().decode(
read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
"utf-8"))
return responseJson['translate']
def crawling(self, word: str, pages_start: int, pages_end: int):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.connect((self.ip, self.port))
@ -94,6 +78,7 @@ class Client(Thread):
request['word'] = word
request['pages_start'] = str(pages_start)
request['pages_end'] = str(pages_end)
request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a'
full_request = generate_request(request)
@ -112,7 +97,7 @@ class Client(Thread):
request = dict()
request['action'] = 'report_' + status
request['spider_info'] = (ip, port)
request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a'
request['cookie'] = self.cookie
full_request = generate_request(request)
@ -122,7 +107,7 @@ class Client(Thread):
read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
"utf-8"))
return responseJson['report_'+status]
return responseJson['report_' + status]
def end(self):
"""
@ -140,10 +125,30 @@ class Client(Thread):
print("end communication!")
def login(self, user, password):
    """
    Log in to the server and remember the cookie it hands back.

    A ``login`` request carrying ``user`` and ``password`` is sent over a
    fresh TCP connection to ``(self.ip, self.port)``; the reply is an
    8-byte big-endian length followed by a UTF-8 JSON document.

    :param user: user name placed in the request
    :param password: password placed in the request
    :return: the ``login`` field of the response. Unless it equals one of
        the two failure messages checked below, it is also stored in
        ``self.cookie``.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
        socket_to_server.connect((self.ip, self.port))

        request = {'action': 'login', 'user': user, 'password': password}
        socket_to_server.sendall(generate_request(request))

        # A single recv(8) may return fewer than 8 bytes on a stream socket,
        # so the length header is read with the same looping helper as the body.
        header = read_bytes(socket_to_server, 8)
        body_size = struct.unpack('!Q', header)[0]
        responseJson = JSONDecoder().decode(
            read_bytes(socket_to_server, body_size).decode("utf-8"))

        result = responseJson['login']
        # Anything other than the two failure messages is kept as the cookie.
        if result not in ['用户名错误,登录失败', '密码错误,登录失败']:
            self.cookie = result
        return result
def run(self) -> None:
print(self.login('2', '2'))
print(self.report_status('free'))
print(self.crawling(input("word:"), pages_start=3, pages_end=4)) # [3,4)
self.report_status('free')
print(self.crawling(input("word:"), pages_start=1, pages_end=4)) # [3,4)
self.end()
@ -151,3 +156,7 @@ download_task = Client(ip, port)
download_task.daemon = True
download_task.start()
download_task.join()
server = Server(7777)
server.daemon = True
server.start()
server.join()

@ -16,35 +16,6 @@ class global_var:
free_spiders = []
current_user_info: list[CUI] = []
def exists(self, cookie):
for i in self.current_user_info:
if i.cookie == cookie:
return True
return False
def get_by_cookie(self, cookie):
for i in self.current_user_info:
if i.cookie == cookie:
return i
return None
def delete_user(self, cookie):
i = self.get_by_cookie(cookie)
self.current_user_info.remove(i)
# 对于每个全局变量都需要定义get_value和set_value接口
def add_free_spider(spider_info):
global_var.free_spiders.append(spider_info)
def get_free_spiders():
return global_var.free_spiders
def delete_spider_by_id(spider_info):
global_var.free_spiders.remove(spider_info)
def get_free_sockets() -> tuple[socket.socket]:
fs: list[socket.socket] = []
@ -54,6 +25,13 @@ def get_free_sockets() -> tuple[socket.socket]:
return tuple(fs)
def exists(cookie):
    """
    Tell whether any entry in ``global_var.current_user_info`` has this cookie.

    :param cookie: cookie value to look for
    :return: True if some entry's ``cookie`` attribute equals it, else False
    """
    return any(user.cookie == cookie for user in global_var.current_user_info)
def add_user(user_name, login_time, login_state, state, cookie, st=None):
    """
    Append a new ``CUI`` record to ``global_var.current_user_info``.

    All arguments are forwarded to ``CUI`` unchanged and in the same order.

    :param st: optional last argument for ``CUI``; defaults to None
    """
    record = CUI(user_name, login_time, login_state, state, cookie, st)
    global_var.current_user_info.append(record)
@ -62,4 +40,15 @@ def set_state_socket(cookie, state):
for i in global_var.current_user_info:
if i.cookie == cookie:
i.state = state
break
break
def get_by_cookie(cookie):
    """
    Find the first entry of ``global_var.current_user_info`` with this cookie.

    :param cookie: cookie value to look for
    :return: the first matching entry, or None when nothing matches
    """
    matches = (user for user in global_var.current_user_info if user.cookie == cookie)
    return next(matches, None)
def delete_user(cookie):
    """
    Remove the entry with this cookie from ``global_var.current_user_info``.

    The lookup uses the module-level ``get_by_cookie`` function rather than
    an attribute of ``global_var``. When no entry matches, nothing is
    removed and no error is raised.

    :param cookie: cookie value identifying the entry to drop
    """
    user = get_by_cookie(cookie)
    if user is not None:
        global_var.current_user_info.remove(user)

@ -43,6 +43,21 @@ def register(u_name, u_pwd):
print(e)
def get_now():
    """
    Ask the database for its current time with ``select now()``.

    :return: the first column of the fetched row, or None when any step
        raised (the exception is printed, not propagated)
    """
    try:
        conn = mysql_conn()
        try:
            cur = conn.cursor()
            try:
                cur.execute('select now()')
                res = cur.fetchone()
            finally:
                # Close the cursor even when the query fails.
                cur.close()
        finally:
            # Close the connection even when the query fails.
            conn.close()
        return res[0]
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print(e)
        return None
def login(u_name, u_pwd, st):
s1 = sha1()
s1.update(u_pwd.encode())
@ -61,9 +76,9 @@ def login(u_name, u_pwd, st):
m_pwd = res[0]
if m_pwd == sha_pwd:
# info = '用户' + u_name + '登录成功'
info = cookie.Cookie(u_name, 'time', 'true').generate_cookie()
config.add_user(u_name, 'time', 'true', 'busy', info, st)
conn.commit()
time = str(get_now())
info = cookie.Cookie(u_name, time, 'true').generate_cookie()
config.add_user(u_name, time, 'true', 'busy', info, st)
else:
info = '密码错误,登录失败'
# 关闭连接
@ -102,9 +117,9 @@ def get_last_crawl_id(table_name: str) -> int:
last_crawl_id = int(last_crawl_id_res[0])
cur.close()
conn.close()
return last_crawl_id
except Exception as e:
print(e)
return last_crawl_id
def drop_table(table_name: str):
@ -148,12 +163,12 @@ def create_crawl_result_table(table_name: str):
def create_user_info(table_name: str = 'user_info'):
create_sql = f'create table if not exists {table_name} (' \
f'id int primary key not null auto_increment,' \
f'create_time timestamp not null default now(),' \
f'user_name varchar(100),' \
f'user_password varchar(200),' \
f'login_state boolean default false' \
f')'
f'id int primary key not null auto_increment,' \
f'create_time timestamp not null default now(),' \
f'user_name varchar(100),' \
f'user_password varchar(200),' \
f'login_state boolean default false' \
f')'
create_table(create_sql)
@ -175,8 +190,9 @@ def write_result2database(res: list, table_name: str, last_crawl_id: int):
if __name__ == '__main__':
create_crawl_result_table('table_name')
print(write_result2database(['name', 'college', 'major', 'paper'], "table_name", last_crawl_id=0))
get_now()
# create_crawl_result_table('table_name')
# print(write_result2database(['name', 'college', 'major', 'paper'], "table_name", last_crawl_id=0))
pass
'''
u_name = input('请输入用户名')

@ -1,5 +1,8 @@
import csv
import threading
import socket
from json import JSONEncoder, JSONDecoder
import struct
import dcs.tests.config
from msedge.selenium_tools import Edge
@ -80,6 +83,33 @@ def write2csv(papers: list, file_name='./paper_author.csv'):
f_papers_authors.close()
def read_bytes(s: 'socket.socket', size: 'int') -> 'bytes':
    """
    Read ``size`` bytes from a socket.

    :param s: socket to read from
    :param size: number of bytes wanted
    :return: the bytes read; if the socket is closed before ``size`` bytes
        arrive, the result may be shorter than ``size``
    """
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = s.recv(remaining)
        if not chunk:
            # An empty read means the peer closed the connection.
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)
def generate_request(request) -> 'bytes':
    """
    Build the wire form of a request from a dict.

    The result is an 8-byte big-endian length header followed by the
    UTF-8 encoded JSON text of ``request``.

    :param request: dict to serialize
    :return: bytes of the framed request
    """
    payload = JSONEncoder().encode(request).encode("utf-8")
    header = struct.pack("!Q", len(payload))
    return b"".join((header, payload))
class Spider(threading.Thread):
def __init__(self, word: str, pages_start=1, pages_end=1):
super().__init__()
@ -89,12 +119,34 @@ class Spider(threading.Thread):
self.pages_end = pages_end
pass
def crawling(self, word: str, pages_start: int, pages_end: int, client_socket: socket.socket):
    """
    Send a ``crawl zhiwang`` request and return the server's answer.

    The request is sent over a newly opened TCP connection to
    ``(self.ip, self.port)``; the reply is an 8-byte big-endian length
    followed by a UTF-8 JSON document.

    :param word: value of the ``word`` field of the request
    :param pages_start: sent as a string in ``pages_start``
    :param pages_end: sent as a string in ``pages_end``
    :param client_socket: not used by this method.
        NOTE(review): confirm whether the request was meant to go over
        this socket instead of a new connection.
    :return: the ``crawl zhiwang`` field of the response
    """
    # NOTE(review): self.ip / self.port are not assigned in the part of the
    # class visible here -- confirm they are set before this is called.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
        socket_to_server.connect((self.ip, self.port))

        request = {
            'action': 'crawl zhiwang',
            'word': word,
            'pages_start': str(pages_start),
            'pages_end': str(pages_end),
        }
        socket_to_server.sendall(generate_request(request))

        # A single recv(8) may return fewer than 8 bytes on a stream socket,
        # so the length header is read with the same looping helper as the body.
        header = read_bytes(socket_to_server, 8)
        body_size = struct.unpack('!Q', header)[0]
        responseJson = JSONDecoder().decode(
            read_bytes(socket_to_server, body_size).decode("utf-8"))
        return responseJson['crawl zhiwang']
def distribute_spiders(self):
free_sockets = dcs.tests.config.get_free_sockets()
qt = (self.pages_end - self.pages_start) // (len(free_sockets)+1) // len(free_sockets)
for st in range(len(free_sockets)):
request_map = {'action': 'crawl_zhiwang', 'pages_start': f'{st}', 'pages_end': f'{st+qt}'}
self.pages_start = st + qt
print(request_map)
print(st, self.crawling(self.word, st, st+qt, free_sockets[st]))
'''
r = RequestHandler(self, free_sockets[st], request_map)
r.start()

Binary file not shown.
Loading…
Cancel
Save