More complete code

master
wufayuan 2 years ago
parent 846d44206e
commit 592d6f9941

@@ -30,6 +30,8 @@ class global_var:
up = None
communicator = None
server_socket = None
configs = None
test = None
def get_free_sockets() -> tuple[socket.socket]:

@@ -3,3 +3,6 @@ port = 7777
daemon = True
buffer_size = 8 * 1024 * 1024
[crawler]
edge_driver_path = G:\course\yykf\dcs\bin\msedgedriver.exe

@@ -2,6 +2,7 @@ import json
import socket
import struct
import threading
from configparser import ConfigParser
from json import JSONDecoder
from time import sleep
@@ -17,8 +18,16 @@ from dcs.tools.message_process import parse_request, generate_response
def crawl_zhiwang(word, pages_start, pages_end):
edge_options = EdgeOptions()
edge_options.use_chromium = True
No_Image_loading = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
edge_options.add_experimental_option("prefs", No_Image_loading)
edge_options.add_argument('--headless')
driver = Edge(options=edge_options, executable_path=r'G:\course\yykf\dcs\bin\msedgedriver.exe')
configFile = '../../conf/settings.ini'
con = ConfigParser()
con.read(configFile, encoding='utf-8')
items = con.items('crawler')
items = dict(items)['edge_driver_path']
print(items)
driver = Edge(options=edge_options, executable_path=items)
soup = driver_open(driver, word)
papers = []  # used to hold the crawled papers
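
The hard-coded msedgedriver path above is now taken from the new [crawler] section of conf/settings.ini. A minimal sketch of that lookup, assuming the relative path ../../conf/settings.ini used in this file; get_edge_driver_path is a helper name chosen here for illustration:

from configparser import ConfigParser

def get_edge_driver_path(config_file: str = '../../conf/settings.ini') -> str:
    # read the ini file and return the edge_driver_path entry from the [crawler] section
    con = ConfigParser()
    con.read(config_file, encoding='utf-8')
    return dict(con.items('crawler'))['edge_driver_path']

# usage: driver = Edge(options=edge_options, executable_path=get_edge_driver_path())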
@@ -85,19 +94,21 @@ class Crawl(threading.Thread):
crawl = Crawl()
crawl.start()
# res = crawl.crawl({'action': 'crawl zhiwang', 'word': 'science', 'pages_start': 1, 'pages_end': 2, 'cookie': '123'})
# logger.debug(res)
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.bind(('127.0.0.1', 9999))
socket_to_server.connect(('127.0.0.1', 7777))
# request = {'action': 'register', 'user': 'wufayuan', 'password': '113818'}
# socket_to_server.sendall(mp.generate_request(request))
# responseJson = JSONDecoder().decode(
# mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
# "utf-8"))
# print(responseJson)
request = {'action': 'register', 'user': 'liuxiaoyu', 'password': '113818'}
socket_to_server.sendall(mp.generate_request(request))
responseJson = JSONDecoder().decode(
mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
"utf-8"))
print(responseJson)
request = {'action': 'login', 'user': 'wufayuan', 'password': '113818'}
request = {'action': 'login', 'user': 'liuxiaoyu', 'password': '113818'}
socket_to_server.sendall(mp.generate_request(request))
responseJson = JSONDecoder().decode(
mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
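
The register and login exchanges above repeat the same framing: an 8-byte big-endian length header (struct format '!Q') followed by a UTF-8 JSON body, in both directions. A small helper along these lines could remove the duplication; send_json_request is a name invented here, and it assumes mp.generate_request and mp.read_bytes behave exactly as used above:

import struct
from json import JSONDecoder
import dcs.tools.message_process as mp  # the test client already uses this module as `mp`; exact import form assumed

def send_json_request(sock, request: dict) -> dict:
    # frame and send the request, then read the length-prefixed JSON response
    sock.sendall(mp.generate_request(request))
    size = struct.unpack('!Q', mp.read_bytes(sock, 8))[0]
    return JSONDecoder().decode(mp.read_bytes(sock, size).decode('utf-8'))

# usage: print(send_json_request(socket_to_server, {'action': 'login', 'user': 'liuxiaoyu', 'password': '113818'}))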

File diff suppressed because one or more lines are too long

@@ -16,6 +16,7 @@ logger.debug('reading config args...')
configFile = '../conf/settings.ini'
con = ConfigParser()
con.read(configFile, encoding='utf-8')
global_var.configs = con
items = con.items('server')
items = dict(items)

@@ -12,7 +12,6 @@ class RequestHandler(threading.Thread):
self.client_socket = client_socket
def run(self) -> None:
request_map = None
try:
while True:
request_map = parse_request(self.client_socket)
@@ -33,6 +32,7 @@ class RequestHandler(threading.Thread):
logger.error(f"no action {request_map['action']}!")
global_var.communicator.add_response('error', self.client_socket,
{request_map['action']: f"no action {request_map['action']}!"})
# break
except Exception as e:
logger.error(str(e))
global_var.communicator.add_response('error', self.client_socket, {request_map['action']: str(e)})
# global_var.communicator.add_response('error', self.client_socket, {request_map['action']: str(e)})

@@ -1,6 +1,7 @@
import socket
import threading
from typing import Optional
import multiprocessing
from loguru import logger
from msedge.selenium_tools import Edge
@@ -42,13 +43,18 @@ class Crawler(threading.Thread):
super(Crawler, self).__init__()
self.partial_task = partial_task
self.last_crawl_id = last_crawl_id
self.edge_driver_path = dict(global_var.configs.items('crawler'))['edge_driver_path']
def crawl_zhiwang(self, user_name=None):
edge_options = EdgeOptions()
edge_options.use_chromium = True
edge_options.add_argument('--headless')
driver = Edge(options=edge_options, executable_path=r'G:\course\yykf\dcs\bin\msedgedriver.exe')
No_Image_loading = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
edge_options.add_experimental_option("prefs", No_Image_loading)
driver = Edge(options=edge_options, executable_path=self.edge_driver_path)
soup = driver_open(driver, self.partial_task.word)  # search for the word
# logger.debug(self.last_crawl_id)
# logger.debug(self.partial_task.word)
papers = []  # used to hold the crawled papers
table_name = f'{user_name}_crawl_result'
@@ -92,10 +98,12 @@
def run(self) -> None:
try:
# self.crawl_zhiwang(user_name=self.partial_task.cui.user_name)
self.test_simulation(user_name=self.partial_task.cui.user_name)
self.crawl_zhiwang(user_name=self.partial_task.cui.user_name)
# self.test_simulation(user_name=self.partial_task.cui.user_name)
except Exception as e:
print(e)
logger.error(str(e))
logger.error(e.__traceback__.tb_frame.f_globals["__file__"])  # file where the exception occurred
logger.error(e.__traceback__.tb_lineno)  # line number where the exception occurred
finally:
logger.info(f'partial crawl task finished: {str(self.partial_task)}')
self.partial_task.thread = None
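
For reference, e.__traceback__.tb_frame and tb_lineno only describe the outermost frame of the traceback; the standard traceback module reports every frame. A possible alternative sketch (not what this commit does; risky_operation is a placeholder for the crawl call):

import traceback
from loguru import logger

try:
    risky_operation()  # placeholder for self.crawl_zhiwang(user_name=...)
except Exception:
    # format_exc() includes file name and line number for every frame in the stack
    logger.error(traceback.format_exc())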
@@ -178,6 +186,8 @@ class Spider_task(threading.Thread):
logger.debug(local_result)
result.update(local_result)
result.update({'crawl_id': self.last_crawl_id+1, 'table_name': self.table_name})
global_var.communicator.add_info('response', self.client_socket.getpeername(), result)
def run(self) -> None:

@@ -33,13 +33,13 @@ class Author:
def driver_open(driver, key_word):
url = "https://www.cnki.net/"
driver.get(url)
time.sleep(1)
# time.sleep(1)
driver.find_element(by=By.CSS_SELECTOR, value='#txt_SearchText').send_keys(key_word)
# time.sleep(2)
# click the search button
driver.find_element(by=By.CSS_SELECTOR,
value='body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click()
time.sleep(5)  # must wait for the results page to load
time.sleep(1)  # must wait for the results page to load
content = driver.page_source.encode('utf-8')
# driver.close()
soup = BeautifulSoup(content, 'lxml')
@ -47,6 +47,7 @@ def driver_open(driver, key_word):
def spider(driver, soup, papers):
logger.debug("crawling a soup...")
tbody = soup.find_all('tbody')
try:
tbody = BeautifulSoup(str(tbody[0]), 'lxml')
@ -55,6 +56,7 @@ def spider(driver, soup, papers):
return
tr = tbody.find_all('tr')
for item in tr:
logger.debug("crawling an item...")
tr_bf = BeautifulSoup(str(item), 'lxml')
td_name = tr_bf.find_all('td', class_='name')
@ -82,13 +84,15 @@ def spider(driver, soup, papers):
print('\n')
paper = Paper(title, authors)
papers.append(paper)
papers.append(paper)
break
# time.sleep(1)  # rest 1s after each call to spider
# pn is the page number to crawl next
def change_page(driver, pn):
driver.find_element(by=By.CSS_SELECTOR, value='#page' + str(pn)).click()
time.sleep(5)
time.sleep(1)
content = driver.page_source.encode('utf-8')
soup = BeautifulSoup(content, 'lxml')
return soup
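
Shortening the fixed sleeps from 5s to 1s speeds crawling up but can race with slow page loads. An explicit wait is a possible middle ground; a sketch using selenium's WebDriverWait (not part of this commit), waiting for the result table that spider() parses; wait_for_results is a helper name chosen here:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_results(driver, timeout: int = 10):
    # block until a result <tbody> is present, or raise TimeoutException after `timeout` seconds
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'tbody')))

# usage, in place of a fixed sleep:
# wait_for_results(driver)
# content = driver.page_source.encode('utf-8')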

@@ -7,9 +7,9 @@ import dcs.tools.cookie as cookie
# obtain a database connection object
def mysql_conn(host='127.0.0.1', user='root', passwd='xwdjzwy5252', db='test'):
def mysql_conn(host='10.129.16.155', user='root', passwd='427318Aa', db='test'):
try:
logger.debug('connecting to database...')
# logger.debug('connecting to database...')
conn = pymysql.connect(host=host, user=user, passwd=passwd, db=db)
return conn
except Exception as e:
@@ -17,9 +17,10 @@ def mysql_conn(host='127.0.0.1', user='root', passwd='xwdjzwy5252', db='test'):
def register(u_name, u_pwd):
s1 = sha1()
s1.update(u_pwd.encode())
sha_pwd = s1.hexdigest()
# s1 = sha1()
# s1.update(u_pwd.encode())
# sha_pwd = s1.hexdigest()
sha_pwd = u_pwd
try:
# obtain a database connection object
conn = mysql_conn()
@@ -127,6 +128,7 @@ def get_last_crawl_id(table_name: str) -> int:
return last_crawl_id
except Exception as e:
print(e)
return 0
def drop_table(table_name: str):

@@ -6,8 +6,10 @@ from conf.config import exists
def parse_request(client_socket: socket.socket):
request_header_size = struct.unpack("!Q", read_bytes(client_socket, 8))[0]
request_map = json.JSONDecoder().decode(read_bytes(client_socket, request_header_size).decode("utf-8"))
data = read_bytes(client_socket, 8)
request_header_size = struct.unpack("!Q", data)[0]
data = read_bytes(client_socket, request_header_size)
request_map = json.JSONDecoder().decode(data.decode("utf-8"))
return request_map
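
parse_request now reads the 8-byte big-endian length header (struct '!Q') and the UTF-8 JSON body in separate steps. The sending side has to frame messages the same way; a sketch of that framing, assumed to match what generate_request in this module produces:

import json
import struct

def frame_message(message: dict) -> bytes:
    # serialize to UTF-8 JSON and prepend the 8-byte big-endian length header
    body = json.dumps(message, ensure_ascii=False).encode('utf-8')
    return struct.pack('!Q', len(body)) + body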

@@ -40,7 +40,7 @@ def read_bytes(s: 'socket.socket', size: 'int') -> 'bytes':
def send_request(ip, port, request_info):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
socket_to_server.bind(('', 9002))
socket_to_server.bind(('', 9012))
socket_to_server.connect((ip, int(port)))
full_request = generate_request(request_info)
@@ -57,13 +57,13 @@ def send_request(ip, port, request_info):
def receive_response():
server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)
server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server_socket.bind(('', 9002))
server_socket.bind(('', 9012))
server_socket.listen()
while True:
client_socket, _ = server_socket.accept()
request_map = parse_request(client_socket)
if request_map['type'] == 'response':
print("receiving response:\n" + json.dumps(request_map, ensure_ascii=False))
# while True:
client_socket, _ = server_socket.accept()
request_map = parse_request(client_socket)
if request_map['type'] == 'response':
print("receiving response:\n" + json.dumps(request_map, ensure_ascii=False))
if __name__ == '__main__':

@@ -1 +1 @@
python .\connect.py --ip 127.0.0.1 --port 7777 login --user wufayuan --password 113818
python .\connect.py --ip 127.0.0.1 --port 7777 login --user yuu --password yuu
