forked from p3t2ja9zs/dcs
parent
ad427ef9bc
commit
60f93c0f0e
@ -0,0 +1,103 @@
import threading
from collections import deque

import bs4
import requests

# Paper, Author and the module-level logger are assumed to be provided by this star import.
from dcs.tests.zhiwang import *


class Crawler:
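    """Crawler for CNKI search results (wap.cnki.net): runs a keyword search,
    follows each paper link and collects Paper/Author records."""
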
    def __init__(self):
        # Earlier desktop endpoints, kept for reference; only the final
        # assignment (the mobile search page) is actually used.
        # self.url = 'https://kns.cnki.net/kns8/Brief/GetGridTableHtml'
        # self.url = 'https://kns.cnki.net/kns8/Group/Result'
        self.url = 'https://wap.cnki.net/touch/web/Article/Search'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49'}
        # Session cookies captured from a browser; not currently attached to the requests below.
        self.cookies = 'Ecp_ClientId=1220704081600243712; Ecp_loginuserbk=wh0302; knsLeftGroupSelectItem=1;2;; Ecp_ClientIp=202.197.9.22; SID_sug=126002; _pk_ses=*; Ecp_IpLoginFail=220708116.162.2.165; _pk_id=6f3fe3b8-fcc4-4111-ad5f-c2f8aba3a0e3.1656893782.2.1657209667.1657209661.; ASP.NET_SessionId=uppv4o3sgpf45j1lmsc4ogin; SID_kns8=015123157; dblang=ch'

    def get_search_html(self):
        """Post a search query to CNKI and return the parsed result page."""
        params = {
            'searchtype': 0,
            'fieldtype': 101,
            'keyword': 'computer'
        }

        res = requests.post(self.url, data=params, headers=self.headers)
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            return soup
        # Implicitly returns None on a non-200 response.

    def get_html_by_link(self, link):
        """Fetch a protocol-relative link ('//...') and return the parsed page."""
        logger.debug(link)
        res = requests.get('https:' + link, headers=self.headers)
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            return soup

    @staticmethod
    def get_paper_links(soup: bs4.BeautifulSoup):
        """Collect the href of every paper link on a search-result page."""
        links = soup.find_all('a', class_='c-company-top-link')
        links_list = []
        for i in links:
            # Skip placeholder anchors; compare the href, not the tag itself.
            href = i.attrs.get('href')
            if not href or href == 'javascript:void(0);':
                continue
            links_list.append(href)
        return links_list

    def parse_paper_html(self, soup: bs4.BeautifulSoup, res):
        """Extract title and authors from a paper page and append a Paper to res."""
        title = soup.find('div', class_='c-card__title2').text.strip()

        authors = soup.find('div', class_='c-card__author')
        # An author block containing a bare space is treated as having no
        # per-author profile pages; fall back to a placeholder Author.
        if '\x20' in authors.text:
            res.append(Paper(title, [Author(None, None, None)]))
            return
        authors_links = [i.attrs['href'] for i in authors.find_all('a')]
        authors_list = []
        for i in authors_links:
            if i == 'javascript:void(0);':
                continue
            authors_list.append(self.parse_author_html(self.get_html_by_link(i)))

        # Append the parsed Author objects, not the raw author tag.
        res.append(Paper(title, authors_list))

    @staticmethod
    def parse_author_html(soup: bs4.BeautifulSoup):
        """Extract name, college and major from an author profile page."""
        try:
            name = soup.find('div', class_='zz-middle-name-text').text.strip()
            college = soup.find('div', class_='zz-middle-company').text.strip()
            major = soup.find('div', class_='zz-info-chart').find('span').text.strip()
        except AttributeError:
            # Fallback for pages without a full profile: only the name is available.
            name = soup.find('div', class_='c-nav__item c-nav__title').text.strip()
            college = None
            major = None
        return Author(name, college, major)

    def crawl(self):
        """Run a search, then parse every result page in its own thread."""
        sh = self.get_search_html()
        pl = self.get_paper_links(sh)
        # deque.append is thread-safe, so the worker threads can share it.
        res = deque()
        threads = []
        for p in pl:
            if p == 'javascript:void(0);':
                continue
            p = self.get_html_by_link(p)
            t = threading.Thread(target=self.parse_paper_html, args=(p, res))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return res

    @staticmethod
    def write2test(content):
        with open('test.html', 'w', encoding='utf-8') as t:
            t.write(content)


if __name__ == '__main__':
    crawler = Crawler()
    data = crawler.crawl()
    for r in data:
        print(r)