More complete code

master
wufayuan 2 years ago
parent 1757411834
commit 5ffd4fd363

@@ -21,6 +1,7 @@ class CUI:
class global_var:
"""需要定义全局变量的放在这里"""
connection = None
free_spiders = []
current_user_info: list[CUI] = []
requester = None

@@ -1,23 +1,71 @@
import json
import socket
import struct
from json import JSONDecoder
import threading
from json import JSONDecoder
from time import sleep
from loguru import logger
from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions
from dcs.tests.zhiwang import *
from dcs.tools import message_process as mp
from dcs.tools.message_process import parse_request, generate_response
def crawl_zhiwang(word, pages_start, pages_end):
edge_options = EdgeOptions()
edge_options.use_chromium = True
# edge_options.add_argument('--headless')
driver = Edge(options=edge_options, executable_path=r'G:\course\yykf\dcs\bin\msedgedriver.exe')
soup = driver_open(driver, word)
papers = [] # stores the crawled papers
paper_id = 0
res = {} # stores this node's crawl results
# crawl the first page
if pages_start == 1:
spider(driver, soup, papers)
logger.debug(res)
pages_start += 1
while paper_id < len(papers):
write2res(papers[paper_id], res)
paper_id += 1
while pages_start < pages_end:
content = change_page(driver, pages_start)
spider(driver, content, papers)
while paper_id < len(papers):
write2res(papers[paper_id], res)
paper_id += 1
pages_start += 1
driver.close()
logger.debug("here")
return res
def write2res(paper: Paper, res):
for author in paper.authors:
if author.name:
res.update(
{len(res): {'name': author.name, 'college': author.college, 'major': author.major, 'title': paper.title}})
class Crawl(threading.Thread):
def __init__(self):
super(Crawl, self).__init__()
self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)
self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
self.server_socket.bind(('', 9000))
self.server_socket.bind(('', 9999))
@staticmethod
def crawl() -> dict:
result_map = {0: {'name': 'remote', 'college': 'remote', 'major': 'remote', 'title': 'remote'}}
def crawl(request_map) -> dict:
result_map = crawl_zhiwang(request_map['word'], request_map['pages_start'], request_map['pages_end'])
# result_map = {0: {'name': 'remote', 'college': 'remote', 'major': 'remote', 'title': 'remote'},
# 1: {'name': 'remote1', 'college': 'remote1', 'major': 'remote', 'title': 'remote'}}
logger.debug(result_map)
return result_map
def run(self) -> None:
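For reference, write2res above flattens every author of a paper into the result map under a running integer key, so crawl_zhiwang returns one flat dict per crawl. A minimal sketch of the resulting shape (the values are placeholders, not real CNKI output):

# hypothetical illustration of the returned result map
res = crawl_zhiwang('science', 1, 3)
# res == {
#     0: {'name': 'Author A', 'college': 'College X', 'major': 'Major Y', 'title': 'Paper 1'},
#     1: {'name': 'Author B', 'college': 'College X', 'major': 'Major Y', 'title': 'Paper 1'},
#     ...
# }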
@@ -26,8 +74,8 @@ class Crawl(threading.Thread):
client_socket, _ = self.server_socket.accept()
request_map = parse_request(client_socket)
if request_map['type'] == 'request':
print("receiving help request:\n"+json.dumps(request_map, ensure_ascii=False))
response_map = self.crawl()
print("receiving help request:\n" + json.dumps(request_map, ensure_ascii=False))
response_map = self.crawl(request_map)
response_map.update({'cookie': request_map['cookie']})
client_socket.sendall(generate_response(response_map))
if request_map['type'] == 'response':
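Judging from the fields read in run() and crawl() above, an incoming help request presumably carries the keyword, the page range and the caller's cookie; a hedged sketch of such a map (values are illustrative only):

# hypothetical example of a help request received by this listener
request_map = {
    'type': 'request',
    'word': 'science',
    'pages_start': 1,
    'pages_end': 3,
    'cookie': '<cookie issued by the server>',
}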
@@ -37,42 +85,41 @@
crawl = Crawl()
crawl.start()
# crawl.join()
with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
socket_to_server.bind(('127.0.0.1', 9000))
socket_to_server.bind(('127.0.0.1', 9999))
socket_to_server.connect(('127.0.0.1', 7777))
request = {'action': 'register', 'user': 'wufayuan', 'password': '113818'}
# request = {'action': 'register', 'user': 'wufayuan', 'password': '113818'}
# socket_to_server.sendall(mp.generate_request(request))
# responseJson = JSONDecoder().decode(
# mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
# "utf-8"))
# print(responseJson)
request = {'action': 'login', 'user': 'wufayuan', 'password': '113818'}
socket_to_server.sendall(mp.generate_request(request))
responseJson = JSONDecoder().decode(
mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
"utf-8"))
cookie = responseJson['cookie']
print(responseJson)
request = {'action': 'login', 'user': 'wufayuan', 'password': '113818'}
request = {'action': 'report_free', 'cookie': cookie}
socket_to_server.sendall(mp.generate_request(request))
responseJson = JSONDecoder().decode(
mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
"utf-8"))
cookie = responseJson['cookie']
print(responseJson)
request = {'action': 'report_free', 'cookie': cookie}
request = {'action': 'crawl zhiwang', 'word': 'science', 'pages_start': 1, 'pages_end': 3,
'cookie': cookie}
socket_to_server.sendall(mp.generate_request(request))
responseJson = JSONDecoder().decode(
mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
"utf-8"))
print(responseJson)
# request = {'action': 'crawl zhiwang', 'word': 'computer', 'pages_start': 1, 'pages_end': 10,
# 'cookie': cookie}
# socket_to_server.sendall(mp.generate_request(request))
# responseJson = JSONDecoder().decode(
# mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
# "utf-8"))
# print(responseJson)
request = {'action': 'end'}
socket_to_server.sendall(mp.generate_request(request))
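The client code above repeats the same pattern three times: send a framed request, read an 8-byte big-endian length ('!Q'), then read and JSON-decode the body. A small helper, sketched under the assumption that mp.generate_request and mp.read_bytes behave exactly as used above (send_and_receive is a hypothetical name), would remove the duplication:

import struct
from json import JSONDecoder

from dcs.tools import message_process as mp


def send_and_receive(sock, request: dict) -> dict:
    """Send one framed request and return the decoded JSON response (sketch)."""
    sock.sendall(mp.generate_request(request))
    body_len = struct.unpack('!Q', sock.recv(8))[0]  # 8-byte big-endian length prefix
    return JSONDecoder().decode(mp.read_bytes(sock, body_len).decode('utf-8'))


# usage, mirroring the flow above:
# response = send_and_receive(socket_to_server, {'action': 'login', 'user': 'wufayuan', 'password': '113818'})
# cookie = response['cookie']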

File diff suppressed because one or more lines are too long

@@ -1,4 +1,3 @@
import csv
import socket
import threading
from typing import Optional
@@ -8,9 +7,9 @@ from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions
from conf.config import global_var, get_free_sockets, get_crawl_result, get_by_cookie, set_state_socket
from dcs.tests.zhiwang import *
from dcs.tools.database import get_crawl_result_by_crawl_id, write_result2database
from dcs.tools.database import get_last_crawl_id, create_crawl_result_table
from dcs.tests.zhiwang import *
def write2database(paper: Paper, table_name: str, last_crawl_id: int):
@@ -47,7 +46,7 @@ class Crawler(threading.Thread):
def crawl_zhiwang(self, user_name=None):
edge_options = EdgeOptions()
edge_options.use_chromium = True
# edge_options.add_argument('headless')
edge_options.add_argument('--headless')
driver = Edge(options=edge_options, executable_path=r'G:\course\yykf\dcs\bin\msedgedriver.exe')
soup = driver_open(driver, self.partial_task.word) # search for word
@@ -130,7 +129,7 @@ class Spider_task(threading.Thread):
self.client_socket = client_socket
self.request_map = request_map
self.partial_tasks: list[Spider_partial_task] = []
self.const_page = 3
self.const_page = 1
def distribute_task(self):
# distribute tasks, 3 pages as a task
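The body of distribute_task is not shown in this diff; a hedged sketch of the page splitting the comment describes, using const_page pages per chunk (split_pages is a hypothetical helper name, and the real implementation may differ), is shown below:

# hypothetical sketch of splitting [pages_start, pages_end) into chunks of const_page pages
def split_pages(pages_start: int, pages_end: int, const_page: int) -> list[tuple[int, int]]:
    """Split the page range into consecutive chunks of at most const_page pages."""
    tasks = []
    start = pages_start
    while start < pages_end:
        end = min(start + const_page, pages_end)
        tasks.append((start, end))
        start = end
    return tasks


# split_pages(1, 10, 3) -> [(1, 4), (4, 7), (7, 10)]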

@@ -6,6 +6,7 @@ import time
import requests
# define the paper class
from loguru import logger
from msedge.selenium_tools import webdriver
from msedge.selenium_tools.service import Service
from selenium.webdriver.common.by import By
@@ -30,21 +31,15 @@ class Author:
# open the CNKI homepage and search for the keyword
def driver_open(driver, key_word):
from msedge.selenium_tools import EdgeOptions
url = "https://www.cnki.net/"
edge_options = EdgeOptions()
edge_options.use_chromium = True
# enable headless mode; other options can be added here as well
edge_options.add_argument('--headless')
url = "https://www.cnki.net/"
driver.get(url)
time.sleep(2)
time.sleep(1)
driver.find_element(by=By.CSS_SELECTOR, value='#txt_SearchText').send_keys(key_word)
time.sleep(2)
# time.sleep(2)
# click the search button
driver.find_element(by=By.CSS_SELECTOR,
value='body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click()
time.sleep(5)
time.sleep(5) # waiting here is required
content = driver.page_source.encode('utf-8')
# driver.close()
soup = BeautifulSoup(content, 'lxml')
@@ -55,7 +50,8 @@ def spider(driver, soup, papers):
tbody = soup.find_all('tbody')
try:
tbody = BeautifulSoup(str(tbody[0]), 'lxml')
except:
except Exception as e:
logger.error(str(e))
return
tr = tbody.find_all('tr')
for item in tr:
@@ -86,7 +82,7 @@ def spider(driver, soup, papers):
print('\n')
paper = Paper(title, authors)
papers.append(paper)
time.sleep(1) # sleep 1s after each spider call
# time.sleep(1) # sleep 1s after each spider call
# pn is the page number currently being crawled
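The Paper and Author definitions in zhiwang.py are outside this diff; judging only from how they are used above (Paper(title, authors), author.name/college/major), they presumably look roughly like the sketch below, though the real classes may carry more fields:

# hypothetical minimal shapes inferred from usage above
class Author:
    def __init__(self, name, college, major):
        self.name = name
        self.college = college
        self.major = major


class Paper:
    def __init__(self, title, authors):
        self.title = title
        self.authors = authors  # list[Author]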

@@ -159,6 +159,7 @@ def get_crawl_result_by_crawl_id(table_name: str, crawl_id: int):
def create_table(create_sql: str):
try:
conn = mysql_conn()
cur = conn.cursor()
cur.execute(create_sql)
conn.commit()
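A hypothetical call to create_table, with a schema made up from the result fields seen elsewhere in this commit (name/college/major/title plus a crawl id); the real SQL used by create_crawl_result_table is not shown in this diff:

# illustrative only -- not the schema actually used by create_crawl_result_table
create_table(
    "CREATE TABLE IF NOT EXISTS crawl_result ("
    "  crawl_id INT,"
    "  name VARCHAR(255), college VARCHAR(255), major VARCHAR(255), title VARCHAR(255)"
    ")"
)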
