diff --git a/dcs/dcs.log b/dcs/dcs.log index 0f66d06..cda7b3d 100644 --- a/dcs/dcs.log +++ b/dcs/dcs.log @@ -384,3 +384,76 @@ 2022-03-30 21:17:57.790 | INFO | __main__::16 - starting the server... 2022-03-30 21:18:01.353 | INFO | dcs.tests.user_request_handler:login:37 - [REQUEST] login 2022-03-30 21:18:01.380 | INFO | dcs.tests.user_request_handler:login:44 - [RESPONSE] login: 用户名错误,登录失败 +2022-03-30 21:49:19.595 | INFO | __main__::9 - reading config args... +2022-03-30 21:49:19.595 | INFO | __main__::16 - starting the server... +2022-03-30 21:49:26.745 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free +2022-03-30 21:49:26.745 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777] +2022-03-30 21:49:29.915 | INFO | dcs.tests.requestHandler:crawl_zhiwang:48 - [REQUEST] crawl zhiwang +2022-03-30 21:49:29.916 | INFO | dcs.tests.spider:run:101 - crawling... +2022-03-30 21:51:23.563 | INFO | __main__::9 - reading config args... +2022-03-30 21:51:23.564 | INFO | __main__::16 - starting the server... +2022-03-30 21:51:25.203 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free +2022-03-30 21:51:25.204 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777] +2022-03-30 21:51:27.382 | INFO | dcs.tests.requestHandler:crawl_zhiwang:48 - [REQUEST] crawl zhiwang +2022-03-30 21:51:27.383 | INFO | dcs.tests.spider:run:101 - crawling... +2022-03-30 21:51:41.782 | INFO | dcs.tests.requestHandler:crawl_zhiwang:61 - [RESPONSE] crawl zhiwang: success +2022-03-30 21:51:41.783 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free +2022-03-30 21:51:41.784 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777] +2022-03-30 21:51:41.784 | INFO | dcs.tests.server:run:36 - [REQUEST] end +2022-03-30 21:51:41.785 | WARNING | dcs.tests.server:run:37 - communication over! +2022-03-30 21:51:41.786 | WARNING | __main__::22 - Overing... +2022-03-30 21:54:57.283 | INFO | __main__::9 - reading config args... +2022-03-30 21:54:57.283 | INFO | __main__::16 - starting the server... +2022-03-30 21:54:59.070 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free +2022-03-30 21:54:59.071 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777] +2022-03-30 21:55:01.139 | INFO | dcs.tests.requestHandler:crawl_zhiwang:48 - [REQUEST] crawl zhiwang +2022-03-30 21:55:01.140 | INFO | dcs.tests.spider:run:102 - crawling... +2022-03-30 21:55:16.140 | INFO | dcs.tests.requestHandler:crawl_zhiwang:61 - [RESPONSE] crawl zhiwang: success +2022-03-30 21:55:16.142 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free +2022-03-30 21:55:16.142 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777] +2022-03-30 21:55:16.143 | INFO | dcs.tests.server:run:36 - [REQUEST] end +2022-03-30 21:55:16.144 | WARNING | dcs.tests.server:run:37 - communication over! +2022-03-30 21:55:16.144 | WARNING | __main__::22 - Overing... +2022-03-30 21:55:56.069 | INFO | __main__::9 - reading config args... +2022-03-30 21:55:56.069 | INFO | __main__::16 - starting the server... +2022-03-30 21:55:57.783 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free +2022-03-30 21:55:57.784 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777] +2022-03-30 21:56:02.438 | INFO | dcs.tests.requestHandler:crawl_zhiwang:48 - [REQUEST] crawl zhiwang +2022-03-30 21:56:02.438 | INFO | dcs.tests.spider:run:101 - crawling... +2022-03-31 21:05:53.263 | INFO | __main__::9 - reading config args... +2022-03-31 21:05:53.264 | INFO | __main__::16 - starting the server... +2022-03-31 21:07:06.282 | INFO | dcs.tests.user_request_handler:register:46 - [REQUEST] register +2022-03-31 21:07:06.316 | INFO | dcs.tests.user_request_handler:register:53 - [RESPONSE] register: 注册成功 +2022-03-31 21:07:19.009 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login +2022-03-31 21:07:19.026 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None +2022-03-31 21:09:18.747 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login +2022-03-31 21:09:18.765 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None +2022-03-31 21:09:28.806 | INFO | __main__::9 - reading config args... +2022-03-31 21:09:28.806 | INFO | __main__::16 - starting the server... +2022-03-31 21:09:31.834 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login +2022-03-31 21:09:31.849 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None +2022-03-31 21:11:07.135 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login +2022-03-31 21:11:07.162 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None +2022-03-31 21:11:14.377 | INFO | __main__::9 - reading config args... +2022-03-31 21:11:14.377 | INFO | __main__::16 - starting the server... +2022-03-31 21:11:18.145 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login +2022-03-31 21:11:18.170 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None +2022-03-31 21:11:41.368 | INFO | __main__::9 - reading config args... +2022-03-31 21:11:41.368 | INFO | __main__::16 - starting the server... +2022-03-31 21:11:44.937 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login +2022-03-31 21:11:44.964 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None +2022-03-31 21:12:17.472 | INFO | __main__::9 - reading config args... +2022-03-31 21:12:17.472 | INFO | __main__::16 - starting the server... +2022-03-31 21:12:20.357 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login +2022-03-31 21:12:20.384 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a +2022-03-31 21:12:37.815 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-03-31 21:13:06.226 | INFO | __main__::9 - reading config args... +2022-03-31 21:13:06.227 | INFO | __main__::16 - starting the server... +2022-03-31 21:13:08.442 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-03-31 21:15:16.895 | INFO | __main__::9 - reading config args... +2022-03-31 21:15:16.896 | INFO | __main__::16 - starting the server... +2022-03-31 21:15:19.142 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-03-31 21:16:06.700 | INFO | __main__::9 - reading config args... +2022-03-31 21:16:06.700 | INFO | __main__::16 - starting the server... +2022-03-31 21:16:08.706 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free +2022-03-31 21:16:08.707 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a diff --git a/dcs/tests/client.py b/dcs/tests/client.py index 033c0a9..69e37b1 100644 --- a/dcs/tests/client.py +++ b/dcs/tests/client.py @@ -74,6 +74,7 @@ class Client(Thread): request = dict() request['action'] = 'translate' request['word'] = word + request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a' full_request = generate_request(request) @@ -85,12 +86,14 @@ class Client(Thread): return responseJson['translate'] - def crawling(self, word: str): + def crawling(self, word: str, pages_start: int, pages_end: int): with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: socket_to_server.connect((self.ip, self.port)) request = dict() request['action'] = 'crawl zhiwang' request['word'] = word + request['pages_start'] = str(pages_start) + request['pages_end'] = str(pages_end) full_request = generate_request(request) @@ -109,6 +112,7 @@ class Client(Thread): request = dict() request['action'] = 'report_' + status request['spider_info'] = (ip, port) + request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a' full_request = generate_request(request) @@ -138,7 +142,7 @@ class Client(Thread): def run(self) -> None: print(self.report_status('free')) - print(self.crawling(input("word:"))) + print(self.crawling(input("word:"), pages_start=3, pages_end=4)) # [3,4) self.report_status('free') self.end() diff --git a/dcs/tests/client_crawl.py b/dcs/tests/client_crawl.py new file mode 100644 index 0000000..ad9bed3 --- /dev/null +++ b/dcs/tests/client_crawl.py @@ -0,0 +1,43 @@ +import os +import requests +from bs4 import BeautifulSoup +#爬虫头数据 +cookies = { + 'SINAGLOBAL': '6797875236621.702.1603159218040', + 'SUB': '_2AkMXbqMSf8NxqwJRmfkTzmnhboh1ygvEieKhMlLJJRMxHRl-yT9jqmg8tRB6PO6N_Rc_2FhPeZF2iThYO9DfkLUGpv4V', + 'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9Wh-nU-QNDs1Fu27p6nmwwiJ', + '_s_tentry': 'www.baidu.com', + 'UOR': 'www.hfut.edu.cn,widget.weibo.com,www.baidu.com', + 'Apache': '7782025452543.054.1635925669528', + 'ULV': '1635925669554:15:1:1:7782025452543.054.1635925669528:1627316870256', +} +headers = { + 'Connection': 'keep-alive', + 'Cache-Control': 'max-age=0', + 'Upgrade-Insecure-Requests': '1', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/25', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', + 'Sec-Fetch-Site': 'cross-site', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-User': '?1', + 'Sec-Fetch-Dest': 'document', + 'Accept-Language': 'zh-CN,zh;q=0.9', +} +params = ( + ('cate', 'realtimehot'), +) +#数据存储 +fo = open("./微博热搜.txt",'a',encoding="utf-8") +#获取网页 +response = requests.get('https://s.weibo.com/top/summary', headers=headers, params=params, cookies=cookies) +#解析网页 +response.encoding='utf-8' +soup = BeautifulSoup(response.text, 'html.parser') +#爬取内容 +content="#pl_top_realtimehot > table > tbody > tr > td.td-02 > a" +#清洗数据 +a=soup.select(content) +for i in range(0,len(a)): + a[i] = a[i].text + fo.write(a[i]+'\n') +fo.close() \ No newline at end of file diff --git a/dcs/tests/config.py b/dcs/tests/config.py index 184da2a..b722742 100644 --- a/dcs/tests/config.py +++ b/dcs/tests/config.py @@ -1,6 +1,36 @@ +import socket + + +class CUI: + def __init__(self, user_name, login_time, login_state, state, cookie, st=None): + self.user_name = user_name + self.login_time = login_time + self.login_state = login_state + self.state = state + self.cookie = cookie + self.socket = st + + class global_var: """需要定义全局变量的放在这里,最好定义一个初始值""" free_spiders = [] + current_user_info: list[CUI] = [] + + def exists(self, cookie): + for i in self.current_user_info: + if i.cookie == cookie: + return True + return False + + def get_by_cookie(self, cookie): + for i in self.current_user_info: + if i.cookie == cookie: + return i + return None + + def delete_user(self, cookie): + i = self.get_by_cookie(cookie) + self.current_user_info.remove(i) # 对于每个全局变量,都需要定义get_value和set_value接口 @@ -14,3 +44,22 @@ def get_free_spiders(): def delete_spider_by_id(spider_info): global_var.free_spiders.remove(spider_info) + + +def get_free_sockets() -> tuple[socket.socket]: + fs: list[socket.socket] = [] + for i in global_var.current_user_info: + if i.state == 'free': + fs.append(i.socket) + return tuple(fs) + + +def add_user(user_name, login_time, login_state, state, cookie, st=None): + global_var.current_user_info.append(CUI(user_name, login_time, login_state, state, cookie, st)) + + +def set_state_socket(cookie, state): + for i in global_var.current_user_info: + if i.cookie == cookie: + i.state = state + break \ No newline at end of file diff --git a/dcs/tests/cookie.py b/dcs/tests/cookie.py new file mode 100644 index 0000000..a8f3011 --- /dev/null +++ b/dcs/tests/cookie.py @@ -0,0 +1,18 @@ +from hashlib import * + + +class Cookie: + def __init__(self, user_name: str, login_time: str, login_state: str, cookie=None): + self.user_name = user_name + self.login_time = login_time + self.login_state = login_state + self.cookie = cookie + + def generate_cookie(self): + s1 = sha1() + s1.update(str(self.user_name+self.login_time+self.login_state).encode()) + self.cookie = s1.hexdigest() + return self.cookie + + def __str__(self): + return self.cookie diff --git a/dcs/tests/database.py b/dcs/tests/database.py index fc3c0cf..580769c 100644 --- a/dcs/tests/database.py +++ b/dcs/tests/database.py @@ -1,5 +1,7 @@ from hashlib import * import pymysql +import dcs.tests.config as config +import dcs.tests.cookie as cookie # 获取数据库连接对象 @@ -41,7 +43,7 @@ def register(u_name, u_pwd): print(e) -def login(u_name, u_pwd): +def login(u_name, u_pwd, st): s1 = sha1() s1.update(u_pwd.encode()) sha_pwd = s1.hexdigest() @@ -58,9 +60,9 @@ def login(u_name, u_pwd): # res有值,用户名正确,判断密码正确与否 m_pwd = res[0] if m_pwd == sha_pwd: - info = '用户' + u_name + '登录成功' - update_sql = f'update user_info set login_state = true where username = {u_name}' - cur.execute(update_sql) + # info = '用户' + u_name + '登录成功' + info = cookie.Cookie(u_name, 'time', 'true').generate_cookie() + config.add_user(u_name, 'time', 'true', 'busy', info, st) conn.commit() else: info = '密码错误,登录失败' diff --git a/dcs/tests/requestHandler.py b/dcs/tests/requestHandler.py index 87c60e0..f79abd1 100644 --- a/dcs/tests/requestHandler.py +++ b/dcs/tests/requestHandler.py @@ -2,8 +2,6 @@ import socket import threading import json import struct -import dcs.tests.config -import dcs.tests.database as database from loguru import logger from dcs.tests.spider import Spider from dcs.tests.user_request_handler import Urh @@ -46,7 +44,13 @@ class RequestHandler(threading.Thread): def crawl_zhiwang(self): logger.info(f"[REQUEST] crawl zhiwang") - spider = Spider(self.request_map['word']) + try: + pages_start = int(self.request_map['pages_start']) + pages_end = int(self.request_map['pages_end']) + except: + pages_start = 1 + pages_end = 1 + spider = Spider(self.request_map['word'], pages_start=pages_start, pages_end=pages_end) spider.run() response = { 'crawl zhiwang': 'success' # TODO @@ -54,36 +58,6 @@ class RequestHandler(threading.Thread): self.client_socket.sendall(generate_response(response)) logger.info(f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}") - def report_state(self, state): - logger.info(f"[REQUEST] report free") - if self.request_map['spider_info'] not in dcs.tests.config.get_free_spiders(): - dcs.tests.config.add_free_spider(self.request_map['spider_info']) - response = { - 'report_free': 'success marked ' + str(self.request_map['spider_info']) - } - self.client_socket.sendall(generate_response(response)) - logger.info(f"[RESPONSE] report free: {response['report_free']}") - - def login(self, user, password): - logger.info(f"[REQUEST] login") - database.mysql_conn() - response = database.login(user, password) - response = { - 'login': response - } - self.client_socket.sendall(generate_response(response)) - logger.info(f"[RESPONSE] login: {response['login']}") - - def register(self, user, password): - logger.info(f"[REQUEST] register") - database.mysql_conn() - response = database.register(user, password) - response = { - 'register': response - } - self.client_socket.sendall(generate_response(response)) - logger.info(f"[RESPONSE] register: {response['register']}") - def run(self) -> None: try: if self.request_map['action'] == 'test': @@ -99,4 +73,3 @@ class RequestHandler(threading.Thread): urh.start() except: pass - diff --git a/dcs/tests/spider.py b/dcs/tests/spider.py index d97d8ae..a190d79 100644 --- a/dcs/tests/spider.py +++ b/dcs/tests/spider.py @@ -4,7 +4,6 @@ import threading import dcs.tests.config from msedge.selenium_tools import Edge from msedge.selenium_tools import EdgeOptions - from dcs.tests.zhiwang import * from loguru import logger from dcs.tests.database import write_result2database, get_last_crawl_id, create_crawl_result_table @@ -91,11 +90,15 @@ class Spider(threading.Thread): pass def distribute_spiders(self): - free_spiders = dcs.tests.config.get_free_spiders() - for sp in free_spiders: - pass - print(self.pages_start, sp) - # TODO 发布任务 + free_sockets = dcs.tests.config.get_free_sockets() + qt = (self.pages_end - self.pages_start) // (len(free_sockets)+1) // len(free_sockets) + for st in range(len(free_sockets)): + request_map = {'action': 'crawl_zhiwang', 'pages_start': f'{st}', 'pages_end': f'{st+qt}'} + self.pages_start = st + qt + ''' + r = RequestHandler(self, free_sockets[st], request_map) + r.start() + ''' def run(self) -> None: logger.info('crawling...') diff --git a/dcs/tests/user_request_handler.py b/dcs/tests/user_request_handler.py index 2d941e8..8c6cb31 100644 --- a/dcs/tests/user_request_handler.py +++ b/dcs/tests/user_request_handler.py @@ -1,9 +1,8 @@ -import threading import socket import threading import json import struct -import dcs.tests.config +import dcs.tests.config as config import dcs.tests.database as database from loguru import logger @@ -25,18 +24,17 @@ class Urh(threading.Thread): def report_state(self, state): logger.info(f"[REQUEST] report free") - if self.request_map['spider_info'] not in dcs.tests.config.get_free_spiders(): - dcs.tests.config.add_free_spider(self.request_map['spider_info']) + config.set_state_socket(self.request_map['cookie'], state) response = { - 'report_free': 'success marked ' + str(self.request_map['spider_info']) + 'report_free': 'success marked ' + str(self.request_map['cookie']) } self.client_socket.sendall(generate_response(response)) logger.info(f"[RESPONSE] report free: {response['report_free']}") - def login(self, user, password): + def login(self, user, password, st): logger.info(f"[REQUEST] login") database.mysql_conn() - response = database.login(user, password) + response = database.login(user, password, st) response = { 'login': response } @@ -57,7 +55,7 @@ class Urh(threading.Thread): if self.request_map['action'] == 'report_free': self.report_state('free') elif self.request_map['action'] == 'login': - self.login(self.request_map['user'], self.request_map['password']) + self.login(self.request_map['user'], self.request_map['password'], self.client_socket) elif self.request_map['action'] == 'register': self.register(self.request_map['user'], self.request_map['password']) else: