初步完善了服务器中存在的“当前用户信息表”,以及保存它的外部变量,同时修缮了数据库处理,以及提供了cookie认证机制,经过初步验证;此外初步实现了爬虫任务分发功能,还未验证,因为需要改写客户端文件。

master
wufayuan 3 years ago
parent 3d8e40bb5e
commit e69f4ea071

@ -384,3 +384,76 @@
2022-03-30 21:17:57.790 | INFO | __main__:<module>:16 - starting the server...
2022-03-30 21:18:01.353 | INFO | dcs.tests.user_request_handler:login:37 - [REQUEST] login
2022-03-30 21:18:01.380 | INFO | dcs.tests.user_request_handler:login:44 - [RESPONSE] login: 用户名错误,登录失败
2022-03-30 21:49:19.595 | INFO | __main__:<module>:9 - reading config args...
2022-03-30 21:49:19.595 | INFO | __main__:<module>:16 - starting the server...
2022-03-30 21:49:26.745 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free
2022-03-30 21:49:26.745 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-30 21:49:29.915 | INFO | dcs.tests.requestHandler:crawl_zhiwang:48 - [REQUEST] crawl zhiwang
2022-03-30 21:49:29.916 | INFO | dcs.tests.spider:run:101 - crawling...
2022-03-30 21:51:23.563 | INFO | __main__:<module>:9 - reading config args...
2022-03-30 21:51:23.564 | INFO | __main__:<module>:16 - starting the server...
2022-03-30 21:51:25.203 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free
2022-03-30 21:51:25.204 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-30 21:51:27.382 | INFO | dcs.tests.requestHandler:crawl_zhiwang:48 - [REQUEST] crawl zhiwang
2022-03-30 21:51:27.383 | INFO | dcs.tests.spider:run:101 - crawling...
2022-03-30 21:51:41.782 | INFO | dcs.tests.requestHandler:crawl_zhiwang:61 - [RESPONSE] crawl zhiwang: success
2022-03-30 21:51:41.783 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free
2022-03-30 21:51:41.784 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-30 21:51:41.784 | INFO | dcs.tests.server:run:36 - [REQUEST] end
2022-03-30 21:51:41.785 | WARNING | dcs.tests.server:run:37 - communication over!
2022-03-30 21:51:41.786 | WARNING | __main__:<module>:22 - Overing...
2022-03-30 21:54:57.283 | INFO | __main__:<module>:9 - reading config args...
2022-03-30 21:54:57.283 | INFO | __main__:<module>:16 - starting the server...
2022-03-30 21:54:59.070 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free
2022-03-30 21:54:59.071 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-30 21:55:01.139 | INFO | dcs.tests.requestHandler:crawl_zhiwang:48 - [REQUEST] crawl zhiwang
2022-03-30 21:55:01.140 | INFO | dcs.tests.spider:run:102 - crawling...
2022-03-30 21:55:16.140 | INFO | dcs.tests.requestHandler:crawl_zhiwang:61 - [RESPONSE] crawl zhiwang: success
2022-03-30 21:55:16.142 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free
2022-03-30 21:55:16.142 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-30 21:55:16.143 | INFO | dcs.tests.server:run:36 - [REQUEST] end
2022-03-30 21:55:16.144 | WARNING | dcs.tests.server:run:37 - communication over!
2022-03-30 21:55:16.144 | WARNING | __main__:<module>:22 - Overing...
2022-03-30 21:55:56.069 | INFO | __main__:<module>:9 - reading config args...
2022-03-30 21:55:56.069 | INFO | __main__:<module>:16 - starting the server...
2022-03-30 21:55:57.783 | INFO | dcs.tests.user_request_handler:report_state:27 - [REQUEST] report free
2022-03-30 21:55:57.784 | INFO | dcs.tests.user_request_handler:report_state:34 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-30 21:56:02.438 | INFO | dcs.tests.requestHandler:crawl_zhiwang:48 - [REQUEST] crawl zhiwang
2022-03-30 21:56:02.438 | INFO | dcs.tests.spider:run:101 - crawling...
2022-03-31 21:05:53.263 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:05:53.264 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:07:06.282 | INFO | dcs.tests.user_request_handler:register:46 - [REQUEST] register
2022-03-31 21:07:06.316 | INFO | dcs.tests.user_request_handler:register:53 - [RESPONSE] register: 注册成功
2022-03-31 21:07:19.009 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login
2022-03-31 21:07:19.026 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None
2022-03-31 21:09:18.747 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login
2022-03-31 21:09:18.765 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None
2022-03-31 21:09:28.806 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:09:28.806 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:09:31.834 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login
2022-03-31 21:09:31.849 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None
2022-03-31 21:11:07.135 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login
2022-03-31 21:11:07.162 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None
2022-03-31 21:11:14.377 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:11:14.377 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:11:18.145 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login
2022-03-31 21:11:18.170 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None
2022-03-31 21:11:41.368 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:11:41.368 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:11:44.937 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login
2022-03-31 21:11:44.964 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: None
2022-03-31 21:12:17.472 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:12:17.472 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:12:20.357 | INFO | dcs.tests.user_request_handler:login:36 - [REQUEST] login
2022-03-31 21:12:20.384 | INFO | dcs.tests.user_request_handler:login:43 - [RESPONSE] login: 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a
2022-03-31 21:12:37.815 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-03-31 21:13:06.226 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:13:06.227 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:13:08.442 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-03-31 21:15:16.895 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:15:16.896 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:15:19.142 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-03-31 21:16:06.700 | INFO | __main__:<module>:9 - reading config args...
2022-03-31 21:16:06.700 | INFO | __main__:<module>:16 - starting the server...
2022-03-31 21:16:08.706 | INFO | dcs.tests.user_request_handler:report_state:26 - [REQUEST] report free
2022-03-31 21:16:08.707 | INFO | dcs.tests.user_request_handler:report_state:32 - [RESPONSE] report free: success marked 2b0fd361bbf0b986fbc20d989a224d66fe9cb13a

@ -74,6 +74,7 @@ class Client(Thread):
request = dict()
request['action'] = 'translate'
request['word'] = word
request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a'
full_request = generate_request(request)
@ -85,12 +86,14 @@ class Client(Thread):
return responseJson['translate']
def crawling(self, word: str):
def crawling(self, word: str, pages_start: int, pages_end: int):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.connect((self.ip, self.port))
request = dict()
request['action'] = 'crawl zhiwang'
request['word'] = word
request['pages_start'] = str(pages_start)
request['pages_end'] = str(pages_end)
full_request = generate_request(request)
@ -109,6 +112,7 @@ class Client(Thread):
request = dict()
request['action'] = 'report_' + status
request['spider_info'] = (ip, port)
request['cookie'] = '2b0fd361bbf0b986fbc20d989a224d66fe9cb13a'
full_request = generate_request(request)
@ -138,7 +142,7 @@ class Client(Thread):
def run(self) -> None:
print(self.report_status('free'))
print(self.crawling(input("word:")))
print(self.crawling(input("word:"), pages_start=3, pages_end=4)) # [3,4)
self.report_status('free')
self.end()

@ -0,0 +1,43 @@
import os
import requests
from bs4 import BeautifulSoup
# Request metadata for the crawl: cookies, headers and query parameters.
# NOTE(review): these cookie values are hard-coded session data committed to
# the repository -- confirm they are safe to publish and still valid.
cookies = {
    'SINAGLOBAL': '6797875236621.702.1603159218040',
    'SUB': '_2AkMXbqMSf8NxqwJRmfkTzmnhboh1ygvEieKhMlLJJRMxHRl-yT9jqmg8tRB6PO6N_Rc_2FhPeZF2iThYO9DfkLUGpv4V',
    'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9Wh-nU-QNDs1Fu27p6nmwwiJ',
    '_s_tentry': 'www.baidu.com',
    'UOR': 'www.hfut.edu.cn,widget.weibo.com,www.baidu.com',
    'Apache': '7782025452543.054.1635925669528',
    'ULV': '1635925669554:15:1:1:7782025452543.054.1635925669528:1627316870256',
}
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/25',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
params = (
    ('cate', 'realtimehot'),
)

# Fetch the page.
response = requests.get('https://s.weibo.com/top/summary', headers=headers, params=params, cookies=cookies)

# Parse the page; force UTF-8 so response.text is decoded consistently.
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

# CSS selector for the anchors whose text is extracted below.
content = "#pl_top_realtimehot > table > tbody > tr > td.td-02 > a"

# Reduce every matched anchor to its text.
a = [anchor.text for anchor in soup.select(content)]

# Append one entry per line. The file is opened only once there is data to
# write, and the context manager closes it even if a write fails.
with open("./微博热搜.txt", 'a', encoding="utf-8") as fo:
    fo.writelines(title + '\n' for title in a)

@ -1,6 +1,36 @@
import socket
class CUI:
    """Record holding the fields tracked for one entry of the current-user table.

    Attributes are stored exactly as passed in; ``st`` is kept under the
    attribute name ``socket`` and defaults to ``None``.
    """

    def __init__(self, user_name, login_time, login_state, state, cookie, st=None):
        # Login bookkeeping for the user.
        self.user_name, self.login_time, self.login_state = (
            user_name,
            login_time,
            login_state,
        )
        # Current status value and the cookie string used to look the record up.
        self.state, self.cookie = state, cookie
        # Connection object associated with the user, if one was supplied.
        self.socket = st
class global_var:
    """Container for module-wide shared state.

    Put every variable that must be global here, preferably with an initial
    value. Both lists are class attributes, so all instances and all
    ``global_var.<name>`` accesses share the same list objects.
    """

    # Entries describing spiders that are currently free.
    free_spiders = []
    # One record per known user; each record exposes a ``cookie`` attribute.
    # The annotation is quoted so it is not evaluated when the class is created.
    current_user_info: "list[CUI]" = []

    def exists(self, cookie):
        """Return True if some user record carries ``cookie``, else False."""
        return any(user.cookie == cookie for user in self.current_user_info)

    def get_by_cookie(self, cookie):
        """Return the first user record whose cookie equals ``cookie``, or None."""
        return next(
            (user for user in self.current_user_info if user.cookie == cookie),
            None,
        )

    def delete_user(self, cookie):
        """Remove the first user record matching ``cookie``.

        An unknown cookie is a no-op; previously this ended up calling
        ``list.remove(None)`` and raised ``ValueError``.
        """
        user = self.get_by_cookie(cookie)
        if user is not None:
            self.current_user_info.remove(user)
# 对于每个全局变量都需要定义get_value和set_value接口
@ -14,3 +44,22 @@ def get_free_spiders():
def delete_spider_by_id(spider_info):
    """Remove ``spider_info`` from the shared free-spider list.

    Uses ``list.remove``, so the first equal entry is dropped and
    ``ValueError`` is raised when no entry matches.
    """
    free_list = global_var.free_spiders
    free_list.remove(spider_info)
def get_free_sockets() -> tuple[socket.socket, ...]:
    """Return the sockets of all user records whose state is ``'free'``.

    The result is a tuple of arbitrary length (possibly empty), in the order
    the records appear in ``global_var.current_user_info``. Records without a
    socket (``socket is None``, the default when none was supplied) are
    skipped so that callers only ever receive usable socket objects.
    """
    return tuple(
        user.socket
        for user in global_var.current_user_info
        if user.state == 'free' and user.socket is not None
    )
def add_user(user_name, login_time, login_state, state, cookie, st=None):
    """Register a user record in the shared current-user table.

    A ``CUI`` record is built from the arguments. If a record with the same
    cookie is already present it is replaced in place; otherwise the new
    record is appended. Replacing matters because every cookie lookup stops at
    the first match, so a duplicate appended later would never be reached and
    the stale record (with its old socket) would keep being used.
    """
    record = CUI(user_name, login_time, login_state, state, cookie, st)
    users = global_var.current_user_info
    for index, existing in enumerate(users):
        if existing.cookie == cookie:
            users[index] = record
            return
    users.append(record)
def set_state_socket(cookie, state):
    """Set ``state`` on the first user record whose cookie equals ``cookie``.

    Only the first matching record is touched; when nothing matches the call
    does nothing.
    """
    target = next(
        (user for user in global_var.current_user_info if user.cookie == cookie),
        None,
    )
    if target is not None:
        target.state = state

@ -0,0 +1,18 @@
from hashlib import *
class Cookie:
    """Cookie value derived from a user's name, login time and login state.

    Args:
        user_name: Name of the user the cookie belongs to.
        login_time: Login time component of the cookie material.
        login_state: Login state component of the cookie material.
        cookie: Pre-computed cookie string, or None to derive it on demand.
    """

    def __init__(self, user_name: str, login_time: str, login_state: str, cookie=None):
        self.user_name = user_name
        self.login_time = login_time
        self.login_state = login_state
        self.cookie = cookie

    def generate_cookie(self):
        """Compute, store and return the SHA-1 hex digest of the three fields.

        The digest input is the concatenation user_name + login_time +
        login_state, UTF-8 encoded. The fields are formatted rather than added
        together, so non-string values no longer raise ``TypeError``.
        """
        # NOTE(review): the digest is fully determined by its inputs, so anyone
        # who knows them can reproduce the cookie. If this is used as an
        # authentication token, consider a random value from ``secrets``.
        material = f"{self.user_name}{self.login_time}{self.login_state}"
        self.cookie = sha1(material.encode()).hexdigest()
        return self.cookie

    def __str__(self):
        """Return the cookie string, deriving it first if it is still unset.

        ``__str__`` must return a ``str``; returning the unset ``None`` value
        raised ``TypeError`` before.
        """
        if self.cookie is None:
            return self.generate_cookie()
        return self.cookie

@ -1,5 +1,7 @@
from hashlib import *
import pymysql
import dcs.tests.config as config
import dcs.tests.cookie as cookie
# 获取数据库连接对象
@ -41,7 +43,7 @@ def register(u_name, u_pwd):
print(e)
def login(u_name, u_pwd):
def login(u_name, u_pwd, st):
s1 = sha1()
s1.update(u_pwd.encode())
sha_pwd = s1.hexdigest()
@ -58,9 +60,9 @@ def login(u_name, u_pwd):
# res有值用户名正确判断密码正确与否
m_pwd = res[0]
if m_pwd == sha_pwd:
info = '用户' + u_name + '登录成功'
update_sql = f'update user_info set login_state = true where username = {u_name}'
cur.execute(update_sql)
# info = '用户' + u_name + '登录成功'
info = cookie.Cookie(u_name, 'time', 'true').generate_cookie()
config.add_user(u_name, 'time', 'true', 'busy', info, st)
conn.commit()
else:
info = '密码错误,登录失败'

@ -2,8 +2,6 @@ import socket
import threading
import json
import struct
import dcs.tests.config
import dcs.tests.database as database
from loguru import logger
from dcs.tests.spider import Spider
from dcs.tests.user_request_handler import Urh
@ -46,7 +44,13 @@ class RequestHandler(threading.Thread):
def crawl_zhiwang(self):
logger.info(f"[REQUEST] crawl zhiwang")
spider = Spider(self.request_map['word'])
try:
pages_start = int(self.request_map['pages_start'])
pages_end = int(self.request_map['pages_end'])
except:
pages_start = 1
pages_end = 1
spider = Spider(self.request_map['word'], pages_start=pages_start, pages_end=pages_end)
spider.run()
response = {
'crawl zhiwang': 'success' # TODO
@ -54,36 +58,6 @@ class RequestHandler(threading.Thread):
self.client_socket.sendall(generate_response(response))
logger.info(f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}")
def report_state(self, state):
logger.info(f"[REQUEST] report free")
if self.request_map['spider_info'] not in dcs.tests.config.get_free_spiders():
dcs.tests.config.add_free_spider(self.request_map['spider_info'])
response = {
'report_free': 'success marked ' + str(self.request_map['spider_info'])
}
self.client_socket.sendall(generate_response(response))
logger.info(f"[RESPONSE] report free: {response['report_free']}")
def login(self, user, password):
logger.info(f"[REQUEST] login")
database.mysql_conn()
response = database.login(user, password)
response = {
'login': response
}
self.client_socket.sendall(generate_response(response))
logger.info(f"[RESPONSE] login: {response['login']}")
def register(self, user, password):
logger.info(f"[REQUEST] register")
database.mysql_conn()
response = database.register(user, password)
response = {
'register': response
}
self.client_socket.sendall(generate_response(response))
logger.info(f"[RESPONSE] register: {response['register']}")
def run(self) -> None:
try:
if self.request_map['action'] == 'test':
@ -99,4 +73,3 @@ class RequestHandler(threading.Thread):
urh.start()
except:
pass

@ -4,7 +4,6 @@ import threading
import dcs.tests.config
from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions
from dcs.tests.zhiwang import *
from loguru import logger
from dcs.tests.database import write_result2database, get_last_crawl_id, create_crawl_result_table
@ -91,11 +90,15 @@ class Spider(threading.Thread):
pass
def distribute_spiders(self):
free_spiders = dcs.tests.config.get_free_spiders()
for sp in free_spiders:
pass
print(self.pages_start, sp)
# TODO 发布任务
free_sockets = dcs.tests.config.get_free_sockets()
qt = (self.pages_end - self.pages_start) // (len(free_sockets)+1) // len(free_sockets)
for st in range(len(free_sockets)):
request_map = {'action': 'crawl_zhiwang', 'pages_start': f'{st}', 'pages_end': f'{st+qt}'}
self.pages_start = st + qt
'''
r = RequestHandler(self, free_sockets[st], request_map)
r.start()
'''
def run(self) -> None:
logger.info('crawling...')

@ -1,9 +1,8 @@
import threading
import socket
import threading
import json
import struct
import dcs.tests.config
import dcs.tests.config as config
import dcs.tests.database as database
from loguru import logger
@ -25,18 +24,17 @@ class Urh(threading.Thread):
def report_state(self, state):
logger.info(f"[REQUEST] report free")
if self.request_map['spider_info'] not in dcs.tests.config.get_free_spiders():
dcs.tests.config.add_free_spider(self.request_map['spider_info'])
config.set_state_socket(self.request_map['cookie'], state)
response = {
'report_free': 'success marked ' + str(self.request_map['spider_info'])
'report_free': 'success marked ' + str(self.request_map['cookie'])
}
self.client_socket.sendall(generate_response(response))
logger.info(f"[RESPONSE] report free: {response['report_free']}")
def login(self, user, password):
def login(self, user, password, st):
logger.info(f"[REQUEST] login")
database.mysql_conn()
response = database.login(user, password)
response = database.login(user, password, st)
response = {
'login': response
}
@ -57,7 +55,7 @@ class Urh(threading.Thread):
if self.request_map['action'] == 'report_free':
self.report_state('free')
elif self.request_map['action'] == 'login':
self.login(self.request_map['user'], self.request_map['password'])
self.login(self.request_map['user'], self.request_map['password'], self.client_socket)
elif self.request_map['action'] == 'register':
self.register(self.request_map['user'], self.request_map['password'])
else:

Loading…
Cancel
Save