重写了爬虫,弃用selenium,爬取手机端知网,多线程爬取

master
wufayuan 2 years ago
parent ad427ef9bc
commit 60f93c0f0e

@ -0,0 +1,103 @@
import threading
from collections import deque
import bs4
from dcs.tests.zhiwang import *
class Crawler:
    """Crawl the mobile CNKI site for papers and their authors.

    A keyword search is posted to the mobile search endpoint, every paper
    link found on the result page is followed, and the author pages linked
    from each paper are parsed into ``Author`` objects.  Each paper is
    downloaded and parsed in its own thread.

    ``requests``, ``logger``, ``Paper`` and ``Author`` are expected to be
    provided by the module's existing imports.
    """

    # Seconds to wait on a single HTTP request, so a stalled connection
    # cannot block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    # href used by anchors that do not point at a real page.
    _VOID_LINK = 'javascript:void(0);'

    def __init__(self):
        """Set the search endpoint and the HTTP headers used for requests."""
        self.url = 'https://wap.cnki.net/touch/web/Article/Search'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49'}
        # NOTE(review): hard-coded session cookie; no request in this class
        # sends it.  Kept only because external code may read the attribute.
        self.cookies = 'Ecp_ClientId=1220704081600243712; Ecp_loginuserbk=wh0302; knsLeftGroupSelectItem=1;2;; Ecp_ClientIp=202.197.9.22; SID_sug=126002; _pk_ses=*; Ecp_IpLoginFail=220708116.162.2.165; _pk_id=6f3fe3b8-fcc4-4111-ad5f-c2f8aba3a0e3.1656893782.2.1657209667.1657209661.; ASP.NET_SessionId=uppv4o3sgpf45j1lmsc4ogin; SID_kns8=015123157; dblang=ch'

    def get_search_html(self, keyword='computer'):
        """Post a search for *keyword* and return the parsed result page.

        Returns:
            The parsed page, or ``None`` when the response status is not 200.
        """
        params = {
            'searchtype': 0,
            'fieldtype': 101,
            'keyword': keyword,
        }
        res = requests.post(self.url, data=params, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT)
        if res.status_code != 200:
            return None
        return bs4.BeautifulSoup(res.text, 'html.parser')

    def get_html_by_link(self, link):
        """Fetch *link* and return the parsed page.

        ``'https:'`` is prepended to *link*, so it is presumably a
        protocol-relative URL (``//host/path``) - confirm against the site.

        Returns:
            The parsed page, or ``None`` when the response status is not 200.
        """
        logger.debug(link)
        res = requests.get('https:' + link, headers=self.headers,
                           timeout=self.REQUEST_TIMEOUT)
        if res.status_code != 200:
            return None
        return bs4.BeautifulSoup(res.text, 'html.parser')

    @staticmethod
    def get_paper_links(soup: 'bs4.BeautifulSoup'):
        """Return the href of every paper link on a search result page.

        Anchors without an href and placeholder ``javascript:void(0);``
        anchors are skipped.
        """
        links_list = []
        for anchor in soup.find_all('a', class_='c-company-top-link'):
            # Compare the href, not the tag object itself, to the placeholder.
            href = anchor.attrs.get('href')
            if not href or href == Crawler._VOID_LINK:
                continue
            links_list.append(href)
        return links_list

    def parse_paper_html(self, soup: 'bs4.BeautifulSoup', res):
        """Parse one paper page and append the resulting ``Paper`` to *res*.

        Args:
            soup: Parsed paper detail page.
            res: Container with an ``append`` method that collects results.

        Raises:
            AttributeError: If the title or author container is missing.
        """
        title = soup.find('div', class_='c-card__title2').text.strip()
        authors = soup.find('div', class_='c-card__author')
        # NOTE(review): a space in the author text is treated as "no author
        # pages to follow" and a placeholder author is stored - confirm this
        # matches the page markup.
        if '\x20' in authors.text:
            res.append(Paper(title, [Author(None, None, None)]))
            return
        authors_list = []
        for anchor in authors.find_all('a'):
            href = anchor.attrs.get('href')
            if not href or href == self._VOID_LINK:
                continue
            author_page = self.get_html_by_link(href)
            if author_page is None:
                # Author page could not be fetched; skip this author only.
                continue
            authors_list.append(self.parse_author_html(author_page))
        # Store the parsed authors, not the raw author container tag.
        res.append(Paper(title, authors_list))

    @staticmethod
    def parse_author_html(soup: 'bs4.BeautifulSoup'):
        """Build an ``Author`` from a parsed author page.

        When the name/college/major elements are not all present, the name is
        read from the navigation title instead and college and major are
        ``None``.
        """
        try:
            name = soup.find('div', class_='zz-middle-name-text').text.strip()
            college = soup.find('div', class_='zz-middle-company').text.strip()
            major = soup.find('div', class_='zz-info-chart').find('span').text.strip()
        except AttributeError:
            # A missing element makes find() return None; use the fallback layout.
            name = soup.find('div', class_='c-nav__item c-nav__title').text.strip()
            college = None
            major = None
        return Author(name, college, major)

    def _crawl_paper(self, link, res):
        """Download one paper page and parse it into *res* (thread target)."""
        page = self.get_html_by_link(link)
        if page is not None:
            self.parse_paper_html(page, res)

    def crawl(self, keyword='computer'):
        """Search for *keyword* and crawl every paper on the result page.

        Returns:
            A ``deque`` of ``Paper`` objects in completion order; empty when
            the search page could not be fetched.
        """
        # deque.append is thread-safe, so workers can share this container.
        res = deque()
        search_page = self.get_search_html(keyword)
        if search_page is None:
            return res
        threads = [
            threading.Thread(target=self._crawl_paper, args=(link, res))
            for link in self.get_paper_links(search_page)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return res

    @staticmethod
    def write2test(content):
        """Write *content* to ``test.html`` (UTF-8) for manual inspection."""
        with open('test.html', 'w', encoding='utf-8') as t:
            t.write(content)
if __name__ == '__main__':
    # Script entry point: run one crawl and print each collected result.
    for paper in Crawler().crawl():
        print(paper)

@ -0,0 +1,614 @@
computer-手机知网
114374
{{bookCount}}篇
搜索
文献
期刊
图书
工具书
主题
相关度
类型
时间
主题
篇名
全文
作者
单位
关键词
摘要
来源
相关度
下载次数
被引频次
最新文献
历史文献
全部
全部期刊
核心期刊
SCI(科学引文索引)
EI(工程索引)
CSSCI(中文社会科学引文索引)
学位论文
博士论文
硕士论文
会议论文
重要报纸
全部文献
近十年文献
近五年文献
近三年文献
自定义
Knowledge graph for identifying geological disasters by integrating comp...
Qinjun Qiu Zhong Xie Die Zhang Kai Ma Liufeng Tao
The occurrence of geological disasters can have a large impact on urban safety. Protecting peoples safety is the most important concern w...
Journal of Earth Science   网络首发  2022.03.05
网络首发
SCI科学
下载56    被引0
Real-time gradation-expressible amplitude-modulationtype electroholography ...
Ren Noguchi Kohei Suzuki Yoshiki Moriguchi Minoru Oikawa Yuichiro Mori
In amplitude-modulation-type electroholography, the binary-weighted computer-generated hologram(BW-CGH) facilitates the gradation-expressib...
Chinese Optics Letters   2021年11期
SCI科学
EI工程
下载14    被引0
CAIE中Computer Science考试对信息技术纳入高考的启示
孔珍珍
介绍和分析CAIE英式课程考试中Computer Science学科的课程内容、考试制度、试题模式,借鉴其成功经验与考试制度,以期借他山之石对我国信息技术课程纳入...
中国现代教育装备   2021年24期
下载20    被引0
A review of the research and application of deep learning-based computer...
Zhang Lingxin Shen Junkai Zhu Baijie
Damage detection is a key procedure in maintenance throughout structures life cycles and post-disaster loss assessment. Due to the comple...
Earthquake Engineering and Engineering Vibration   2022年01期
SCI科学
EI工程
下载109    被引0
Computer-Assisted Language Learning: the State of the Art with Specia...
This paper attempts to present a critical review of the state of the art in computer-assisted language learning(CALL) with special referenc...
外语电化教学   2008年02期
CSSCI
下载535    被引5
Opening Moves Involved in Text-based Computer-Mediated-Communication ...
李莉华
The development of science and technology has made it not only possible but very convenient for people living in different parts of the wor...
海外英语   2011年03期
下载69    被引0
Emotion recognition for human-computer interaction
Jianhua TAO
<正>Emotion recognition is to quantify, describe and recognize different emotional states through the behavioral and physiological response...
虚拟现实与智能硬件(中英文)   2021年01期
下载17    被引0
A lightweight data-voting strategy for triple-modular redundant control ...
LIU Bo YANG MengFei WANG Yong YUAN Li LIU ChaoWei
Triple-modular redundancy(TMR), a well-known methodology for improving the reliability of computer systems, has been used for onboard contr...
Science China(Technological Sciences)   2022年02期
SCI科学
EI工程
下载34    被引0
Computer generation of detailed reaction networks in hydrocracking of...
Jingjing Wang Wei Zhao Kunpeng Song Hongwei Xiang Liping Zhou
Fischer-Tropsch synthesis(FTS) wax is a mixture of linear hydrocarbons with carbon number from C7 to C70+.Converting FTS wax into high-qual...
Chinese Journal of Chemical Engineering   2022年01期
SCI科学
EI工程
下载6    被引0
Transfer Learning Algorithm Design for Feature Transfer Problem in Motor Im...
Yu Zhang Huaqing Li Heng Dong Zheng Dai Xing Chen
The non-stationary of the motor imagery electroencephalography(MI-EEG) signal is one of the main limitations for the development of motor i...
China Communications   2022年02期
SCI科学
下载31    被引0
点击加载更多
筛选
学科
研究层次
机构
主办单位
作者
导师
重置
完成
文献
期刊
工具书
图书
文集
首页
搜索
客户端
充值
帮助
违法和不良信息举报电话400-062-8866
举报邮箱jubao@cnki.net
©2022中国知网(CNKI)

@ -15,7 +15,10 @@ class Paper:
self.authors = authors
def __str__(self):
    """Return the title line followed by one ``author: ...`` line per author."""
    # The earlier single-line return was removed: it made the author
    # listing below unreachable.
    lines = [f'title: {self.title}\n']
    lines.extend(f'author: {author}\n' for author in self.authors)
    return ''.join(lines)
# 定义作者类
@ -25,6 +28,9 @@ class Author:
self.college = college
self.major = major
def __str__(self):
    """Return the author as ``name, college, major`` on a single line."""
    name, college, major = self.name, self.college, self.major
    return f'{name}, {college}, {major}'
# 进入知网首页并搜索关键词
def driver_open(driver, key_word):
@ -80,7 +86,6 @@ def spider(driver, soup, papers):
# print('\n')
paper = Paper(title, authors)
papers.append(paper)
papers.append(paper)
# break # TODO: this is to shorten time of crawling
# time.sleep(1) # 每调一次spider休息1s

Loading…
Cancel
Save