较为完整的代码

master
wufayuan 3 years ago
parent 846d44206e
commit 592d6f9941

@ -30,6 +30,8 @@ class global_var:
up = None up = None
communicator = None communicator = None
server_socket = None server_socket = None
configs = None
test = None
def get_free_sockets() -> tuple[socket.socket]: def get_free_sockets() -> tuple[socket.socket]:

@ -3,3 +3,6 @@ port = 7777
daemon = True daemon = True
buffer_size = 8 * 1024 * 1024 buffer_size = 8 * 1024 * 1024
[crawler]
edge_driver_path = G:\course\yykf\dcs\bin\msedgedriver.exe

@ -2,6 +2,7 @@ import json
import socket import socket
import struct import struct
import threading import threading
from configparser import ConfigParser
from json import JSONDecoder from json import JSONDecoder
from time import sleep from time import sleep
@ -17,8 +18,16 @@ from dcs.tools.message_process import parse_request, generate_response
def crawl_zhiwang(word, pages_start, pages_end): def crawl_zhiwang(word, pages_start, pages_end):
edge_options = EdgeOptions() edge_options = EdgeOptions()
edge_options.use_chromium = True edge_options.use_chromium = True
No_Image_loading = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
edge_options.add_experimental_option("prefs", No_Image_loading)
edge_options.add_argument('--headless') edge_options.add_argument('--headless')
driver = Edge(options=edge_options, executable_path=r'G:\course\yykf\dcs\bin\msedgedriver.exe') configFile = '../../conf/settings.ini'
con = ConfigParser()
con.read(configFile, encoding='utf-8')
items = con.items('crawler')
items = dict(items)['edge_driver_path']
print(items)
driver = Edge(options=edge_options, executable_path=items)
soup = driver_open(driver, word) soup = driver_open(driver, word)
papers = [] # 用于保存爬取到的论文 papers = [] # 用于保存爬取到的论文
@ -85,19 +94,21 @@ class Crawl(threading.Thread):
crawl = Crawl() crawl = Crawl()
crawl.start() crawl.start()
# res = crawl.crawl({'action': 'crawl zhiwang', 'word': 'science', 'pages_start': 1, 'pages_end': 2, 'cookie': '123'})
# logger.debug(res)
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.bind(('127.0.0.1', 9999)) socket_to_server.bind(('127.0.0.1', 9999))
socket_to_server.connect(('127.0.0.1', 7777)) socket_to_server.connect(('127.0.0.1', 7777))
# request = {'action': 'register', 'user': 'wufayuan', 'password': '113818'} request = {'action': 'register', 'user': 'liuxiaoyu', 'password': '113818'}
# socket_to_server.sendall(mp.generate_request(request)) socket_to_server.sendall(mp.generate_request(request))
# responseJson = JSONDecoder().decode( responseJson = JSONDecoder().decode(
# mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode( mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
# "utf-8")) "utf-8"))
# print(responseJson) print(responseJson)
request = {'action': 'login', 'user': 'wufayuan', 'password': '113818'} request = {'action': 'login', 'user': 'liuxiaoyu', 'password': '113818'}
socket_to_server.sendall(mp.generate_request(request)) socket_to_server.sendall(mp.generate_request(request))
responseJson = JSONDecoder().decode( responseJson = JSONDecoder().decode(
mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode( mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(

File diff suppressed because one or more lines are too long

@ -16,6 +16,7 @@ logger.debug('reading config args...')
configFile = '../conf/settings.ini' configFile = '../conf/settings.ini'
con = ConfigParser() con = ConfigParser()
con.read(configFile, encoding='utf-8') con.read(configFile, encoding='utf-8')
global_var.configs = con
items = con.items('server') items = con.items('server')
items = dict(items) items = dict(items)

@ -12,7 +12,6 @@ class RequestHandler(threading.Thread):
self.client_socket = client_socket self.client_socket = client_socket
def run(self) -> None: def run(self) -> None:
request_map = None
try: try:
while True: while True:
request_map = parse_request(self.client_socket) request_map = parse_request(self.client_socket)
@ -33,6 +32,7 @@ class RequestHandler(threading.Thread):
logger.error(f"no action {request_map['action']}!") logger.error(f"no action {request_map['action']}!")
global_var.communicator.add_response('error', self.client_socket, global_var.communicator.add_response('error', self.client_socket,
{request_map['action']: f"no action {request_map['action']}!"}) {request_map['action']: f"no action {request_map['action']}!"})
# break
except Exception as e: except Exception as e:
logger.error(str(e)) logger.error(str(e))
global_var.communicator.add_response('error', self.client_socket, {request_map['action']: str(e)}) # global_var.communicator.add_response('error', self.client_socket, {request_map['action']: str(e)})

@ -1,6 +1,7 @@
import socket import socket
import threading import threading
from typing import Optional from typing import Optional
import multiprocessing
from loguru import logger from loguru import logger
from msedge.selenium_tools import Edge from msedge.selenium_tools import Edge
@ -42,13 +43,18 @@ class Crawler(threading.Thread):
super(Crawler, self).__init__() super(Crawler, self).__init__()
self.partial_task = partial_task self.partial_task = partial_task
self.last_crawl_id = last_crawl_id self.last_crawl_id = last_crawl_id
self.edge_driver_path = dict(global_var.configs.items('crawler'))['edge_driver_path']
def crawl_zhiwang(self, user_name=None): def crawl_zhiwang(self, user_name=None):
edge_options = EdgeOptions() edge_options = EdgeOptions()
edge_options.use_chromium = True edge_options.use_chromium = True
edge_options.add_argument('--headless') edge_options.add_argument('--headless')
driver = Edge(options=edge_options, executable_path=r'G:\course\yykf\dcs\bin\msedgedriver.exe') No_Image_loading = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
edge_options.add_experimental_option("prefs", No_Image_loading)
driver = Edge(options=edge_options, executable_path=self.edge_driver_path)
soup = driver_open(driver, self.partial_task.word) # 搜索word soup = driver_open(driver, self.partial_task.word) # 搜索word
# logger.debug(self.last_crawl_id)
# logger.debug(self.partial_task.word)
papers = [] # 用于保存爬取到的论文 papers = [] # 用于保存爬取到的论文
table_name = f'{user_name}_crawl_result' table_name = f'{user_name}_crawl_result'
@ -92,10 +98,12 @@ class Crawler(threading.Thread):
def run(self) -> None: def run(self) -> None:
try: try:
# self.crawl_zhiwang(user_name=self.partial_task.cui.user_name) self.crawl_zhiwang(user_name=self.partial_task.cui.user_name)
self.test_simulation(user_name=self.partial_task.cui.user_name) # self.test_simulation(user_name=self.partial_task.cui.user_name)
except Exception as e: except Exception as e:
print(e) logger.error(str(e))
logger.error(e.__traceback__.tb_frame.f_globals["__file__"]) # 发生异常所在的文件
logger.error(e.__traceback__.tb_lineno) # 发生异常所在的行数
finally: finally:
logger.info(f'partial crawl task finished: {str(self.partial_task)}') logger.info(f'partial crawl task finished: {str(self.partial_task)}')
self.partial_task.thread = None self.partial_task.thread = None
@ -178,6 +186,8 @@ class Spider_task(threading.Thread):
logger.debug(local_result) logger.debug(local_result)
result.update(local_result) result.update(local_result)
result.update({'crawl_id': self.last_crawl_id+1, 'table_name': self.table_name})
global_var.communicator.add_info('response', self.client_socket.getpeername(), result) global_var.communicator.add_info('response', self.client_socket.getpeername(), result)
def run(self) -> None: def run(self) -> None:

@ -33,13 +33,13 @@ class Author:
def driver_open(driver, key_word): def driver_open(driver, key_word):
url = "https://www.cnki.net/" url = "https://www.cnki.net/"
driver.get(url) driver.get(url)
time.sleep(1) # time.sleep(1)
driver.find_element(by=By.CSS_SELECTOR, value='#txt_SearchText').send_keys(key_word) driver.find_element(by=By.CSS_SELECTOR, value='#txt_SearchText').send_keys(key_word)
# time.sleep(2) # time.sleep(2)
# 点击搜索按钮 # 点击搜索按钮
driver.find_element(by=By.CSS_SELECTOR, driver.find_element(by=By.CSS_SELECTOR,
value='body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click() value='body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click()
time.sleep(5) # 必须要等待 time.sleep(1) # 必须要等待
content = driver.page_source.encode('utf-8') content = driver.page_source.encode('utf-8')
# driver.close() # driver.close()
soup = BeautifulSoup(content, 'lxml') soup = BeautifulSoup(content, 'lxml')
@ -47,6 +47,7 @@ def driver_open(driver, key_word):
def spider(driver, soup, papers): def spider(driver, soup, papers):
logger.debug("crawling a soup...")
tbody = soup.find_all('tbody') tbody = soup.find_all('tbody')
try: try:
tbody = BeautifulSoup(str(tbody[0]), 'lxml') tbody = BeautifulSoup(str(tbody[0]), 'lxml')
@ -55,6 +56,7 @@ def spider(driver, soup, papers):
return return
tr = tbody.find_all('tr') tr = tbody.find_all('tr')
for item in tr: for item in tr:
logger.debug("crawling an item...")
tr_bf = BeautifulSoup(str(item), 'lxml') tr_bf = BeautifulSoup(str(item), 'lxml')
td_name = tr_bf.find_all('td', class_='name') td_name = tr_bf.find_all('td', class_='name')
@ -82,13 +84,15 @@ def spider(driver, soup, papers):
print('\n') print('\n')
paper = Paper(title, authors) paper = Paper(title, authors)
papers.append(paper) papers.append(paper)
papers.append(paper)
break
# time.sleep(1) # 每调一次spider休息1s # time.sleep(1) # 每调一次spider休息1s
# pn表示当前要爬的页数 # pn表示当前要爬的页数
def change_page(driver, pn): def change_page(driver, pn):
driver.find_element(by=By.CSS_SELECTOR, value='#page' + str(pn)).click() driver.find_element(by=By.CSS_SELECTOR, value='#page' + str(pn)).click()
time.sleep(5) time.sleep(1)
content = driver.page_source.encode('utf-8') content = driver.page_source.encode('utf-8')
soup = BeautifulSoup(content, 'lxml') soup = BeautifulSoup(content, 'lxml')
return soup return soup

@ -7,9 +7,9 @@ import dcs.tools.cookie as cookie
# 获取数据库连接对象 # 获取数据库连接对象
def mysql_conn(host='127.0.0.1', user='root', passwd='xwdjzwy5252', db='test'): def mysql_conn(host='10.129.16.155', user='root', passwd='427318Aa', db='test'):
try: try:
logger.debug('connecting to database...') # logger.debug('connecting to database...')
conn = pymysql.connect(host=host, user=user, passwd=passwd, db=db) conn = pymysql.connect(host=host, user=user, passwd=passwd, db=db)
return conn return conn
except Exception as e: except Exception as e:
@ -17,9 +17,10 @@ def mysql_conn(host='127.0.0.1', user='root', passwd='xwdjzwy5252', db='test'):
def register(u_name, u_pwd): def register(u_name, u_pwd):
s1 = sha1() # s1 = sha1()
s1.update(u_pwd.encode()) # s1.update(u_pwd.encode())
sha_pwd = s1.hexdigest() # sha_pwd = s1.hexdigest()
sha_pwd = u_pwd
try: try:
# 获取数据库连接对象 # 获取数据库连接对象
conn = mysql_conn() conn = mysql_conn()
@ -127,6 +128,7 @@ def get_last_crawl_id(table_name: str) -> int:
return last_crawl_id return last_crawl_id
except Exception as e: except Exception as e:
print(e) print(e)
return 0
def drop_table(table_name: str): def drop_table(table_name: str):

@ -6,8 +6,10 @@ from conf.config import exists
def parse_request(client_socket: socket.socket): def parse_request(client_socket: socket.socket):
request_header_size = struct.unpack("!Q", read_bytes(client_socket, 8))[0] data = read_bytes(client_socket, 8)
request_map = json.JSONDecoder().decode(read_bytes(client_socket, request_header_size).decode("utf-8")) request_header_size = struct.unpack("!Q", data)[0]
data = read_bytes(client_socket, request_header_size)
request_map = json.JSONDecoder().decode(data.decode("utf-8"))
return request_map return request_map

@ -40,7 +40,7 @@ def read_bytes(s: 'socket.socket', size: 'int') -> 'bytes':
def send_request(ip, port, request_info): def send_request(ip, port, request_info):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server: with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) socket_to_server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
socket_to_server.bind(('', 9002)) socket_to_server.bind(('', 9012))
socket_to_server.connect((ip, int(port))) socket_to_server.connect((ip, int(port)))
full_request = generate_request(request_info) full_request = generate_request(request_info)
@ -57,13 +57,13 @@ def send_request(ip, port, request_info):
def receive_response(): def receive_response():
server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)
server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server_socket.bind(('', 9002)) server_socket.bind(('', 9012))
server_socket.listen() server_socket.listen()
while True: # while True:
client_socket, _ = server_socket.accept() client_socket, _ = server_socket.accept()
request_map = parse_request(client_socket) request_map = parse_request(client_socket)
if request_map['type'] == 'response': if request_map['type'] == 'response':
print("receiving response:\n" + json.dumps(request_map, ensure_ascii=False)) print("receiving response:\n" + json.dumps(request_map, ensure_ascii=False))
if __name__ == '__main__': if __name__ == '__main__':

@ -1 +1 @@
python .\connect.py --ip 127.0.0.1 --port 7777 login --user wufayuan --password 113818 python .\connect.py --ip 127.0.0.1 --port 7777 login --user yuu --password yuu

Loading…
Cancel
Save