Fairly complete code

master
wufayuan 3 years ago
parent 1757411834
commit 5ffd4fd363

@@ -21,6 +21,7 @@ class CUI:
 class global_var:
     """Global variables that need to be defined go here"""
+    connection = None
     free_spiders = []
     current_user_info: list[CUI] = []
     requester = None

@@ -1,23 +1,71 @@
 import json
 import socket
 import struct
-from json import JSONDecoder
 import threading
+from json import JSONDecoder
+from time import sleep
+from loguru import logger
+from msedge.selenium_tools import Edge
+from msedge.selenium_tools import EdgeOptions
+from dcs.tests.zhiwang import *
 from dcs.tools import message_process as mp
 from dcs.tools.message_process import parse_request, generate_response
+def crawl_zhiwang(word, pages_start, pages_end):
+    edge_options = EdgeOptions()
+    edge_options.use_chromium = True
+    # edge_options.add_argument('--headless')
+    driver = Edge(options=edge_options, executable_path=r'G:\course\yykf\dcs\bin\msedgedriver.exe')
+    soup = driver_open(driver, word)
+    papers = []  # used to store the crawled papers
+    paper_id = 0
+    res = {}  # stores the crawl results for this endpoint
+    # crawl the first page
+    if pages_start == 1:
+        spider(driver, soup, papers)
+        logger.debug(res)
+        pages_start += 1
+        while paper_id < len(papers):
+            write2res(papers[paper_id], res)
+            paper_id += 1
+    while pages_start < pages_end:
+        content = change_page(driver, pages_start)
+        spider(driver, content, papers)
+        while paper_id < len(papers):
+            write2res(papers[paper_id], res)
+            paper_id += 1
+        pages_start += 1
+    driver.close()
+    logger.debug("here")
+    return res
+def write2res(paper: Paper, res):
+    for author in paper.authors:
+        if author.name:
+            res.update(
+                {len(res): {'name': author.name, 'college': author.college, 'major': author.major, 'title': paper.title}})
 class Crawl(threading.Thread):
     def __init__(self):
         super(Crawl, self).__init__()
         self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)
         self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        self.server_socket.bind(('', 9000))
+        self.server_socket.bind(('', 9999))
     @staticmethod
-    def crawl() -> dict:
-        result_map = {0: {'name': 'remote', 'college': 'remote', 'major': 'remote', 'title': 'remote'}}
+    def crawl(request_map) -> dict:
+        result_map = crawl_zhiwang(request_map['word'], request_map['pages_start'], request_map['pages_end'])
+        # result_map = {0: {'name': 'remote', 'college': 'remote', 'major': 'remote', 'title': 'remote'},
+        #               1: {'name': 'remote1', 'college': 'remote1', 'major': 'remote', 'title': 'remote'}}
+        logger.debug(result_map)
         return result_map
     def run(self) -> None:
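The dict that write2res builds, and that the reworked crawl() now returns, has the same flat shape as the old hard-coded result_map: integer keys counting up from 0, one record per author. A minimal sketch of that shape, with placeholder values rather than real crawl output:

# Shape of the result dict produced by write2res / returned by crawl_zhiwang.
# The values here are placeholders, not actual CNKI data.
res = {
    0: {'name': 'Zhang San', 'college': 'Some University', 'major': 'Computer Science',
        'title': 'A paper about science'},
    1: {'name': 'Li Si', 'college': 'Another University', 'major': 'Physics',
        'title': 'A paper about science'},
}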
@@ -27,7 +75,7 @@ class Crawl(threading.Thread):
         request_map = parse_request(client_socket)
         if request_map['type'] == 'request':
             print("receiving help request:\n" + json.dumps(request_map, ensure_ascii=False))
-            response_map = self.crawl()
+            response_map = self.crawl(request_map)
             response_map.update({'cookie': request_map['cookie']})
             client_socket.sendall(generate_response(response_map))
         if request_map['type'] == 'response':
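For reference, the maps handled in run() are plain dicts decoded from JSON. Judging from the keys accessed above, one help-request round trip looks roughly like the following sketch (all values are illustrative, and any key not referenced in the diff is an assumption):

# Hypothetical help request received by the worker; values are made up.
help_request = {
    'type': 'request',      # dispatched on in run()
    'word': 'science',      # forwarded to crawl_zhiwang()
    'pages_start': 1,
    'pages_end': 3,
    'cookie': 'abc123',     # identifies the task on the central server
}

# Matching response: the crawl result dict plus the echoed cookie.
help_response = {
    0: {'name': '...', 'college': '...', 'major': '...', 'title': '...'},
    'cookie': 'abc123',
}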
@@ -37,42 +85,41 @@ class Crawl(threading.Thread):
 crawl = Crawl()
 crawl.start()
-# crawl.join()
 with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
-    socket_to_server.bind(('127.0.0.1', 9000))
+    socket_to_server.bind(('127.0.0.1', 9999))
     socket_to_server.connect(('127.0.0.1', 7777))
-    request = {'action': 'register', 'user': 'wufayuan', 'password': '113818'}
+    # request = {'action': 'register', 'user': 'wufayuan', 'password': '113818'}
+    # socket_to_server.sendall(mp.generate_request(request))
+    # responseJson = JSONDecoder().decode(
+    #     mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
+    #         "utf-8"))
+    # print(responseJson)
+    request = {'action': 'login', 'user': 'wufayuan', 'password': '113818'}
     socket_to_server.sendall(mp.generate_request(request))
     responseJson = JSONDecoder().decode(
         mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
             "utf-8"))
+    cookie = responseJson['cookie']
     print(responseJson)
-    request = {'action': 'login', 'user': 'wufayuan', 'password': '113818'}
+    request = {'action': 'report_free', 'cookie': cookie}
     socket_to_server.sendall(mp.generate_request(request))
     responseJson = JSONDecoder().decode(
         mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
             "utf-8"))
-    cookie = responseJson['cookie']
     print(responseJson)
-    request = {'action': 'report_free', 'cookie': cookie}
+    request = {'action': 'crawl zhiwang', 'word': 'science', 'pages_start': 1, 'pages_end': 3,
+               'cookie': cookie}
     socket_to_server.sendall(mp.generate_request(request))
     responseJson = JSONDecoder().decode(
         mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
             "utf-8"))
     print(responseJson)
-    # request = {'action': 'crawl zhiwang', 'word': 'computer', 'pages_start': 1, 'pages_end': 10,
-    #            'cookie': cookie}
-    # socket_to_server.sendall(mp.generate_request(request))
-    # responseJson = JSONDecoder().decode(
-    #     mp.read_bytes(socket_to_server, struct.unpack('!Q', socket_to_server.recv(8))[0]).decode(
-    #         "utf-8"))
-    # print(responseJson)
     request = {'action': 'end'}
     socket_to_server.sendall(mp.generate_request(request))
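The receive pattern repeated throughout this client (read 8 bytes, unpack with '!Q', read that many bytes, JSON-decode) implies that mp.generate_request and mp.read_bytes use a simple length-prefixed framing. A minimal self-contained sketch of that framing, assuming a big-endian unsigned 64-bit length prefix; the helper names below are illustrative, not the actual dcs.tools.message_process API:

import json
import socket
import struct


def send_json(sock: socket.socket, obj: dict) -> None:
    """Frame a dict as an 8-byte big-endian length ('!Q') followed by UTF-8 JSON."""
    body = json.dumps(obj, ensure_ascii=False).encode('utf-8')
    sock.sendall(struct.pack('!Q', len(body)) + body)


def recv_exact(sock: socket.socket, n: int) -> bytes:
    """Read exactly n bytes, looping until the whole chunk has arrived."""
    buf = b''
    while len(buf) < n:
        chunk = sock.recv(n - len(buf))
        if not chunk:
            raise ConnectionError('socket closed before the full message arrived')
        buf += chunk
    return buf


def recv_json(sock: socket.socket) -> dict:
    """Read the length prefix, then decode the JSON body."""
    length = struct.unpack('!Q', recv_exact(sock, 8))[0]
    return json.loads(recv_exact(sock, length).decode('utf-8'))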

File diff suppressed because one or more lines are too long

@@ -1,4 +1,3 @@
-import csv
 import socket
 import threading
 from typing import Optional
@@ -8,9 +7,9 @@ from msedge.selenium_tools import Edge
 from msedge.selenium_tools import EdgeOptions
 from conf.config import global_var, get_free_sockets, get_crawl_result, get_by_cookie, set_state_socket
-from dcs.tests.zhiwang import *
 from dcs.tools.database import get_crawl_result_by_crawl_id, write_result2database
 from dcs.tools.database import get_last_crawl_id, create_crawl_result_table
+from dcs.tests.zhiwang import *
 def write2database(paper: Paper, table_name: str, last_crawl_id: int):
@@ -47,7 +46,7 @@ class Crawler(threading.Thread):
     def crawl_zhiwang(self, user_name=None):
         edge_options = EdgeOptions()
         edge_options.use_chromium = True
-        # edge_options.add_argument('headless')
+        edge_options.add_argument('--headless')
         driver = Edge(options=edge_options, executable_path=r'G:\course\yykf\dcs\bin\msedgedriver.exe')
         soup = driver_open(driver, self.partial_task.word)  # search for word
@@ -130,7 +129,7 @@ class Spider_task(threading.Thread):
         self.client_socket = client_socket
         self.request_map = request_map
         self.partial_tasks: list[Spider_partial_task] = []
-        self.const_page = 3
+        self.const_page = 1
     def distribute_task(self):
         # distribute tasks, 3 pages as a task
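const_page is the number of result pages packed into each partial task; the comment above still says 3 pages per task while the value is now 1. The body of distribute_task is not shown in this diff, so the following is only a sketch of how a page range could be cut into const_page-sized chunks, using the same exclusive pages_end convention as crawl_zhiwang above:

# Hypothetical helper, not the actual Spider_task.distribute_task implementation.
def split_pages(pages_start: int, pages_end: int, const_page: int = 1):
    """Yield (start, end) sub-ranges covering at most const_page pages each."""
    start = pages_start
    while start < pages_end:
        end = min(start + const_page, pages_end)
        yield start, end
        start = end


print(list(split_pages(1, 6, const_page=3)))  # [(1, 4), (4, 6)]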

@@ -6,6 +6,7 @@ import time
 import requests
 # define the Paper class
+from loguru import logger
 from msedge.selenium_tools import webdriver
 from msedge.selenium_tools.service import Service
 from selenium.webdriver.common.by import By
@@ -30,21 +31,15 @@ class Author:
 # open the CNKI home page and search for the keyword
 def driver_open(driver, key_word):
-    from msedge.selenium_tools import EdgeOptions
-    url = "https://www.cnki.net/"
-    edge_options = EdgeOptions()
-    edge_options.use_chromium = True
-    # headless mode; other options can be added here as well
-    edge_options.add_argument('--headless')
     url = "https://www.cnki.net/"
     driver.get(url)
-    time.sleep(2)
+    time.sleep(1)
     driver.find_element(by=By.CSS_SELECTOR, value='#txt_SearchText').send_keys(key_word)
-    time.sleep(2)
+    # time.sleep(2)
     # click the search button
     driver.find_element(by=By.CSS_SELECTOR,
                         value='body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click()
-    time.sleep(5)
+    time.sleep(5)  # must wait here
     content = driver.page_source.encode('utf-8')
     # driver.close()
     soup = BeautifulSoup(content, 'lxml')
@@ -55,7 +50,8 @@ def spider(driver, soup, papers):
     tbody = soup.find_all('tbody')
     try:
         tbody = BeautifulSoup(str(tbody[0]), 'lxml')
-    except:
+    except Exception as e:
+        logger.error(str(e))
         return
     tr = tbody.find_all('tr')
     for item in tr:
@@ -86,7 +82,7 @@ def spider(driver, soup, papers):
         print('\n')
         paper = Paper(title, authors)
         papers.append(paper)
-    time.sleep(1)  # sleep 1 s after each spider call
+    # time.sleep(1)  # sleep 1 s after each spider call
 # pn is the number of the page to crawl

@@ -159,6 +159,7 @@ def get_crawl_result_by_crawl_id(table_name: str, crawl_id: int):
 def create_table(create_sql: str):
     try:
         conn = mysql_conn()
         cur = conn.cursor()
         cur.execute(create_sql)
         conn.commit()
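create_table simply executes whatever DDL string it is given on a fresh mysql_conn() and commits. A hedged usage sketch follows; the table name and columns are hypothetical (chosen to match the name/college/major/title records the crawler produces) and are not taken from create_crawl_result_table, whose body this diff does not show:

# Hypothetical DDL; the real create_crawl_result_table may use different names and columns.
create_sql = """
CREATE TABLE IF NOT EXISTS crawl_result_example (
    crawl_id INT NOT NULL,
    name     VARCHAR(255),
    college  VARCHAR(255),
    major    VARCHAR(255),
    title    VARCHAR(512)
)
"""
create_table(create_sql)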
