parent ad427ef9bc
commit 60f93c0f0e
@@ -0,0 +1,103 @@
import threading
from collections import deque

import bs4
import requests  # used directly below; assumed otherwise to arrive via the wildcard import

from dcs.tests.zhiwang import *  # expected to provide Paper, Author and logger used below

class Crawler:
    def __init__(self):
        # Earlier endpoints kept for reference; only the mobile search page is used.
        # self.url = 'https://kns.cnki.net/kns8/Brief/GetGridTableHtml'
        # self.url = 'https://kns.cnki.net/kns8/Group/Result'
        self.url = 'https://wap.cnki.net/touch/web/Article/Search'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49'}
        # Stored but not attached to any request below.
        self.cookies = 'Ecp_ClientId=1220704081600243712; Ecp_loginuserbk=wh0302; knsLeftGroupSelectItem=1;2;; Ecp_ClientIp=202.197.9.22; SID_sug=126002; _pk_ses=*; Ecp_IpLoginFail=220708116.162.2.165; _pk_id=6f3fe3b8-fcc4-4111-ad5f-c2f8aba3a0e3.1656893782.2.1657209667.1657209661.; ASP.NET_SessionId=uppv4o3sgpf45j1lmsc4ogin; SID_kns8=015123157; dblang=ch'

    def get_search_html(self):
        # POST the search form and return the parsed result page, or None on a non-200 response.
        params = {
            'searchtype': 0,
            'fieldtype': 101,
            'keyword': 'computer'
        }

        res = requests.post(self.url, data=params, headers=self.headers)
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            return soup

    def get_html_by_link(self, link):
        # Fetch a protocol-relative link ('//...') and return the parsed page, or None on failure.
        logger.debug(link)
        res = requests.get('https:' + link, headers=self.headers)
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            return soup

    @staticmethod
    def get_paper_links(soup: bs4.BeautifulSoup):
        # Collect the href of every result link on the search page.
        links = soup.find_all('a', class_='c-company-top-link')
        links_list = []
        for i in links:
            href = i.attrs.get('href')
            # Skip anchors without a real target (missing or javascript placeholder hrefs).
            if not href or href == 'javascript:void(0);':
                continue
            links_list.append(href)
        return links_list

    def parse_paper_html(self, soup: bs4.BeautifulSoup, res):
        # Parse one paper page and append a Paper record to the shared result deque.
        title = soup.find('div', class_='c-card__title2').text.strip()

        authors = soup.find('div', class_='c-card__author')
        # If the author block contains a bare space, fall back to a placeholder author.
        if '\x20' in authors.text:
            res.append(Paper(title, [Author(None, None, None)]))
            return
        authors_links = [i.attrs['href'] for i in authors.find_all('a')]
        authors_list = []
        for i in authors_links:
            if i == 'javascript:void(0);':
                continue
            authors_list.append(self.parse_author_html(self.get_html_by_link(i)))

        res.append(Paper(title, authors_list))

    @staticmethod
    def parse_author_html(soup: bs4.BeautifulSoup):
        # Author profile pages carry name/college/major; fall back to the page
        # title when the profile layout is missing.
        try:
            name = soup.find('div', class_='zz-middle-name-text').text.strip()
            college = soup.find('div', class_='zz-middle-company').text.strip()
            major = soup.find('div', class_='zz-info-chart').find('span').text.strip()
        except AttributeError:
            name = soup.find('div', class_='c-nav__item c-nav__title').text.strip()
            college = None
            major = None
        return Author(name, college, major)

    def crawl(self):
        # Fetch the search page, then parse each paper page in its own thread,
        # collecting results in a deque (appends are atomic in CPython).
        sh = self.get_search_html()
        pl = self.get_paper_links(sh)
        res = deque()
        threads = []
        for p in pl:
            if p == 'javascript:void(0);':
                continue
            p = self.get_html_by_link(p)
            t = threading.Thread(target=self.parse_paper_html, args=(p, res))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return res

    @staticmethod
    def write2test(content):
        # Dump raw HTML to test.html; not called elsewhere in this module.
        with open('test.html', 'w', encoding='utf-8') as t:
            t.write(content)


if __name__ == '__main__':
    crawler = Crawler()
    data = crawler.crawl()
    for r in data:
        print(r)
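
For readers outside the dcs project: Paper, Author, and logger are pulled in through the wildcard import from dcs.tests.zhiwang, and their real definitions are not part of this commit. The stand-ins below are a minimal sketch, with field names inferred only from how Crawler uses them, so the module above can be exercised in isolation; they are assumptions, not the project's actual classes.

# Hypothetical stand-ins for names supplied by dcs.tests.zhiwang; field names
# are assumptions based on how Crawler calls them, not the real definitions.
import logging
from dataclasses import dataclass, field
from typing import List, Optional

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('crawler')


@dataclass
class Author:
    name: Optional[str]
    college: Optional[str]
    major: Optional[str]


@dataclass
class Paper:
    title: str
    authors: List[Author] = field(default_factory=list)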