diff --git a/dcs/dcs.log b/dcs/dcs.log index cda7b3d..ee05eef 100644 --- a/dcs/dcs.log +++ b/dcs/dcs.log @@ -457,3 +457,79 @@ 2022-03-31 21:16:06.700 | INFO | __main__::16 - starting the server... 2022-03-31 21:16:08.706 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free 2022-03-31 21:16:08.707 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a +2022-03-31 21:56:03.819 | INFO | __main__::9 - reading config args... +2022-03-31 21:56:03.820 | INFO | __main__::16 - starting the server... +2022-03-31 21:56:06.132 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-03-31 21:56:06.132 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a +2022-03-31 21:56:14.117 | INFO | dcs.tests.requestHandler:crawl_zhiwang:46 - [REQUEST] crawl zhiwang +2022-03-31 21:56:14.117 | INFO | dcs.tests.spider:run:104 - crawling... +2022-03-31 21:59:43.973 | INFO | __main__::9 - reading config args... +2022-03-31 21:59:43.974 | INFO | __main__::16 - starting the server... +2022-03-31 21:59:47.613 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-03-31 21:59:47.643 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: None +2022-03-31 22:00:10.219 | INFO | __main__::9 - reading config args... +2022-03-31 22:00:10.220 | INFO | __main__::16 - starting the server... +2022-03-31 22:00:13.984 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-03-31 22:00:14.029 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 6b95f1badcff627f56e1d1b2b436a66e9d668179 +2022-03-31 22:00:22.738 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-03-31 22:00:22.779 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 2a17833fb66e33e37c5b893f4d11054ba660f39a +2022-03-31 22:00:25.994 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-03-31 22:00:26.023 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 012f698322e6cd67d75c8bc91f6a9058f9262337 +2022-04-01 08:15:26.267 | INFO | __main__::9 - reading config args... +2022-04-01 08:15:26.268 | INFO | __main__::16 - starting the server... +2022-04-01 08:15:39.570 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-04-01 08:15:39.594 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 7afecad3195f6b790418127316607238198b8193 +2022-04-01 08:16:31.391 | INFO | __main__::9 - reading config args... +2022-04-01 08:16:31.394 | INFO | __main__::16 - starting the server... +2022-04-01 08:16:35.093 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-04-01 08:16:35.100 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 23eb3a72fb6e9eac029ed47356e4eff0e394768f +2022-04-01 08:17:07.484 | INFO | __main__::9 - reading config args... +2022-04-01 08:17:07.487 | INFO | __main__::16 - starting the server... +2022-04-01 08:17:26.291 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-04-01 08:19:48.255 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 9825b67537b1e2a589ff2570f033d74f33fbaf84 +2022-04-01 08:29:59.684 | INFO | __main__::9 - reading config args... +2022-04-01 08:29:59.685 | INFO | __main__::16 - starting the server... +2022-04-01 08:30:03.578 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-04-01 08:30:03.578 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a +2022-04-01 08:30:03.579 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-04-01 08:30:03.618 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 93f0363797d703e85fd8b7f216b2cc3cd33e234d +2022-04-01 08:30:03.618 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-04-01 08:30:03.619 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a +2022-04-01 08:30:03.619 | INFO | dcs.tests.server:run:36 - [REQUEST] end +2022-04-01 08:30:03.620 | WARNING | dcs.tests.server:run:37 - communication over! +2022-04-01 08:30:03.620 | WARNING | __main__::22 - Overing... +2022-04-01 08:41:16.250 | INFO | __main__::9 - reading config args... +2022-04-01 08:41:16.251 | INFO | __main__::16 - starting the server... +2022-04-01 08:41:19.564 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-04-01 08:41:19.578 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 40456734e87596bb849caa4554a2283a27b6e871 +2022-04-01 08:41:19.579 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-04-01 08:41:19.580 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 40456734e87596bb849caa4554a2283a27b6e871 +2022-04-01 08:41:19.581 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-04-01 08:41:19.581 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 40456734e87596bb849caa4554a2283a27b6e871 +2022-04-01 08:41:19.581 | INFO | dcs.tests.server:run:36 - [REQUEST] end +2022-04-01 08:41:19.582 | WARNING | dcs.tests.server:run:37 - communication over! +2022-04-01 08:41:19.582 | WARNING | __main__::22 - Overing... +2022-04-01 08:42:10.111 | INFO | __main__::9 - reading config args... +2022-04-01 08:42:10.112 | INFO | __main__::16 - starting the server... +2022-04-01 08:42:12.631 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-04-01 08:42:12.659 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 3852e55acf1b5641e3d67991d5d104e8e82d66c2 +2022-04-01 08:42:12.660 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-04-01 08:42:12.660 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 3852e55acf1b5641e3d67991d5d104e8e82d66c2 +2022-04-01 08:42:24.187 | INFO | dcs.tests.requestHandler:crawl_zhiwang:46 - [REQUEST] crawl zhiwang +2022-04-01 08:42:24.188 | INFO | dcs.tests.spider:run:154 - crawling... +2022-04-01 08:46:30.210 | INFO | __main__::9 - reading config args... +2022-04-01 08:46:30.210 | INFO | __main__::16 - starting the server... +2022-04-01 08:46:32.912 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-04-01 08:46:32.941 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 91c17091bf6ff1855e24152606b9ee73fee41059 +2022-04-01 08:46:32.942 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-04-01 08:46:32.943 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 91c17091bf6ff1855e24152606b9ee73fee41059 +2022-04-01 08:46:42.426 | INFO | dcs.tests.requestHandler:crawl_zhiwang:46 - [REQUEST] crawl zhiwang +2022-04-01 08:46:42.426 | INFO | dcs.tests.spider:run:155 - crawling... +2022-04-08 20:27:02.331 | INFO | __main__::9 - reading config args... +2022-04-08 20:27:02.333 | INFO | __main__::16 - starting the server... +2022-04-08 20:27:13.717 | INFO | dcs.tests.user_request_handler:login:35 - [REQUEST] login +2022-04-08 20:27:13.753 | INFO | dcs.tests.user_request_handler:login:42 - [RESPONSE] login: 5bc37fbdcb59701ad127b0a76920326a77a882ba +2022-04-08 20:27:13.754 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-04-08 20:27:13.754 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 5bc37fbdcb59701ad127b0a76920326a77a882ba +2022-04-08 20:27:20.484 | INFO | dcs.tests.requestHandler:crawl_zhiwang:46 - [REQUEST] crawl zhiwang +2022-04-08 20:27:20.484 | INFO | dcs.tests.spider:run:156 - crawling... diff --git a/dcs/tests/client.py b/dcs/tests/client.py index 69e37b1..65bfc76 100644 --- a/dcs/tests/client.py +++ b/dcs/tests/client.py @@ -4,6 +4,7 @@ from threading import Thread import socket from json import JSONEncoder, JSONDecoder import sys +from dcs.tests.server import Server # -------------------------------配置-------------------------------------------- # ------------------------------config-------------------------------------------- @@ -43,14 +44,15 @@ def generate_request(request) -> 'bytes': class Client(Thread): - def __init__(self, ip: str, port: int) -> None: + def __init__(self, server_ip: str, server_port: int) -> None: """ - :param ip: 服务器IP - :param port: 服务器端口 + :param server_ip: 服务器IP + :param server_port: 服务器端口 """ super().__init__() - self.ip = ip - self.port = port + self.ip = server_ip + self.port = server_port + self.cookie = None def test(self): with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: @@ -68,24 +70,6 @@ class Client(Thread): return responseJson['test'] - def translate(self, word: str): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: - socket_to_server.connect((self.ip, self.port)) - request = dict() - request['action'] = 'translate' - request['word'] = word - request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a' - - full_request = generate_request(request) - - socket_to_server.sendall(full_request) - - responseJson = JSONDecoder().decode( - read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode( - "utf-8")) - - return responseJson['translate'] - def crawling(self, word: str, pages_start: int, pages_end: int): with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: socket_to_server.connect((self.ip, self.port)) @@ -94,6 +78,7 @@ class Client(Thread): request['word'] = word request['pages_start'] = str(pages_start) request['pages_end'] = str(pages_end) + request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a' full_request = generate_request(request) @@ -112,7 +97,7 @@ class Client(Thread): request = dict() request['action'] = 'report_' + status request['spider_info'] = (ip, port) - request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a' + request['cookie'] = self.cookie full_request = generate_request(request) @@ -122,7 +107,7 @@ class Client(Thread): read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode( "utf-8")) - return responseJson['report_'+status] + return responseJson['report_' + status] def end(self): """ @@ -140,10 +125,30 @@ class Client(Thread): print("end communication!") + def login(self, user, password): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: + socket_to_server.connect((self.ip, self.port)) + request = dict() + request['action'] = 'login' + request['user'] = user + request['password'] = password + + full_request = generate_request(request) + + socket_to_server.sendall(full_request) + + responseJson = JSONDecoder().decode( + read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode( + "utf-8")) + + if responseJson['login'] not in ['用户名错误,登录失败', '密码错误,登录失败']: + self.cookie = responseJson['login'] + return responseJson['login'] + def run(self) -> None: + print(self.login('2', '2')) print(self.report_status('free')) - print(self.crawling(input("word:"), pages_start=3, pages_end=4)) # [3,4) - self.report_status('free') + print(self.crawling(input("word:"), pages_start=1, pages_end=4)) # [3,4) self.end() @@ -151,3 +156,7 @@ download_task = Client(ip, port) download_task.daemon = True download_task.start() download_task.join() +server = Server(7777) +server.daemon = True +server.start() +server.join() diff --git a/dcs/tests/config.py b/dcs/tests/config.py index b722742..88815e1 100644 --- a/dcs/tests/config.py +++ b/dcs/tests/config.py @@ -16,35 +16,6 @@ class global_var: free_spiders = [] current_user_info: list[CUI] = [] - def exists(self, cookie): - for i in self.current_user_info: - if i.cookie == cookie: - return True - return False - - def get_by_cookie(self, cookie): - for i in self.current_user_info: - if i.cookie == cookie: - return i - return None - - def delete_user(self, cookie): - i = self.get_by_cookie(cookie) - self.current_user_info.remove(i) - - -# 对于每个全局变量,都需要定义get_value和set_value接口 -def add_free_spider(spider_info): - global_var.free_spiders.append(spider_info) - - -def get_free_spiders(): - return global_var.free_spiders - - -def delete_spider_by_id(spider_info): - global_var.free_spiders.remove(spider_info) - def get_free_sockets() -> tuple[socket.socket]: fs: list[socket.socket] = [] @@ -54,6 +25,13 @@ def get_free_sockets() -> tuple[socket.socket]: return tuple(fs) +def exists(cookie): + for i in global_var.current_user_info: + if i.cookie == cookie: + return True + return False + + def add_user(user_name, login_time, login_state, state, cookie, st=None): global_var.current_user_info.append(CUI(user_name, login_time, login_state, state, cookie, st)) @@ -62,4 +40,15 @@ def set_state_socket(cookie, state): for i in global_var.current_user_info: if i.cookie == cookie: i.state = state - break \ No newline at end of file + break + + +def get_by_cookie(cookie): + for i in global_var.current_user_info: + if i.cookie == cookie: + return i + return None + +def delete_user(cookie): + i = global_var.get_by_cookie(cookie) + global_var.current_user_info.remove(i) \ No newline at end of file diff --git a/dcs/tests/database.py b/dcs/tests/database.py index 580769c..d4ce33e 100644 --- a/dcs/tests/database.py +++ b/dcs/tests/database.py @@ -43,6 +43,21 @@ def register(u_name, u_pwd): print(e) +def get_now(): + try: + conn = mysql_conn() + cur = conn.cursor() + select_sql = f'select now()' + cur.execute(select_sql) + res = cur.fetchone() + # 关闭连接 + cur.close() + conn.close() + return res[0] + except Exception as e: + print(e) + + def login(u_name, u_pwd, st): s1 = sha1() s1.update(u_pwd.encode()) @@ -61,9 +76,9 @@ def login(u_name, u_pwd, st): m_pwd = res[0] if m_pwd == sha_pwd: # info = '用户' + u_name + '登录成功' - info = cookie.Cookie(u_name, 'time', 'true').generate_cookie() - config.add_user(u_name, 'time', 'true', 'busy', info, st) - conn.commit() + time = str(get_now()) + info = cookie.Cookie(u_name, time, 'true').generate_cookie() + config.add_user(u_name, time, 'true', 'busy', info, st) else: info = '密码错误,登录失败' # 关闭连接 @@ -102,9 +117,9 @@ def get_last_crawl_id(table_name: str) -> int: last_crawl_id = int(last_crawl_id_res[0]) cur.close() conn.close() + return last_crawl_id except Exception as e: print(e) - return last_crawl_id def drop_table(table_name: str): @@ -148,12 +163,12 @@ def create_crawl_result_table(table_name: str): def create_user_info(table_name: str = 'user_info'): create_sql = f'create table if not exists {table_name} (' \ - f'id int primary key not null auto_increment,' \ - f'create_time timestamp not null default now(),' \ - f'user_name varchar(100),' \ - f'user_password varchar(200),' \ - f'login_state boolean default false' \ - f')' + f'id int primary key not null auto_increment,' \ + f'create_time timestamp not null default now(),' \ + f'user_name varchar(100),' \ + f'user_password varchar(200),' \ + f'login_state boolean default false' \ + f')' create_table(create_sql) @@ -175,8 +190,9 @@ def write_result2database(res: list, table_name: str, last_crawl_id: int): if __name__ == '__main__': - create_crawl_result_table('table_name') - print(write_result2database(['name', 'college', 'major', 'paper'], "table_name", last_crawl_id=0)) + get_now() + # create_crawl_result_table('table_name') + # print(write_result2database(['name', 'college', 'major', 'paper'], "table_name", last_crawl_id=0)) pass ''' u_name = input('请输入用户名') diff --git a/dcs/tests/spider.py b/dcs/tests/spider.py index a190d79..2f4bb63 100644 --- a/dcs/tests/spider.py +++ b/dcs/tests/spider.py @@ -1,5 +1,8 @@ import csv import threading +import socket +from json import JSONEncoder, JSONDecoder +import struct import dcs.tests.config from msedge.selenium_tools import Edge @@ -80,6 +83,33 @@ def write2csv(papers: list, file_name='./paper_author.csv'): f_papers_authors.close() +def read_bytes(s: 'socket.socket', size: 'int') -> 'bytes': + """ + 从socket读取size个字节 + :param s:套接字 + :param size:要读取的大小 + :return:读取的字节数,在遇到套接字关闭的情况下,返回的数据的长度可能小于 size + """ + data = ''.encode('utf-8') + while len(data) < size: + rsp_data = s.recv(size - len(data)) + data += rsp_data + if len(rsp_data) == 0: + break + return data + + +def generate_request(request) -> 'bytes': + """ + 根据传入的dict生成请求 + 请求包含 8字节头长度+头数据 + :param request: dict + :return: bytes 请求数据 + """ + request_bytes = JSONEncoder().encode(request).encode("utf-8") + return struct.pack("!Q", len(request_bytes)) + request_bytes + + class Spider(threading.Thread): def __init__(self, word: str, pages_start=1, pages_end=1): super().__init__() @@ -89,12 +119,34 @@ class Spider(threading.Thread): self.pages_end = pages_end pass + def crawling(self, word: str, pages_start: int, pages_end: int, client_socket: socket.socket): + + with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: + socket_to_server.connect((self.ip, self.port)) + request = dict() + request['action'] = 'crawl zhiwang' + request['word'] = word + request['pages_start'] = str(pages_start) + request['pages_end'] = str(pages_end) + + full_request = generate_request(request) + + socket_to_server.sendall(full_request) + + responseJson = JSONDecoder().decode( + read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode( + "utf-8")) + + return responseJson['crawl zhiwang'] + def distribute_spiders(self): free_sockets = dcs.tests.config.get_free_sockets() qt = (self.pages_end - self.pages_start) // (len(free_sockets)+1) // len(free_sockets) for st in range(len(free_sockets)): request_map = {'action': 'crawl_zhiwang', 'pages_start': f'{st}', 'pages_end': f'{st+qt}'} self.pages_start = st + qt + print(request_map) + print(st, self.crawling(self.word, st, st+qt, free_sockets[st])) ''' r = RequestHandler(self, free_sockets[st], request_map) r.start() diff --git a/docs/pictures/项目结构图.pdf b/docs/pictures/项目结构图.pdf new file mode 100644 index 0000000..aa374cc Binary files /dev/null and b/docs/pictures/项目结构图.pdf differ