class Crawler:
    """Crawl the CNKI mobile search page and collect papers with their authors.

    The crawl runs in three steps: post a search query, follow each result
    link to a paper detail page, then follow each author link on that page.
    Parsed papers are gathered in a ``deque`` shared by the worker threads.
    """

    # Seconds to wait on a single HTTP request. Without a timeout ``requests``
    # can block forever on a stalled connection and hang the whole crawl.
    timeout = 30

    # Placeholder href the site puts on anchors that do not lead to a page.
    _VOID_LINK = 'javascript:void(0);'

    def __init__(self):
        # Endpoints that were tried before settling on the mobile search page:
        #   https://kns.cnki.net/kns8/Brief/GetGridTableHtml
        #   https://kns.cnki.net/kns8/Group/Result
        self.url = 'https://wap.cnki.net/touch/web/Article/Search'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49'}
        # NOTE(review): hard-coded session cookies; no request in this class
        # sends them. Kept only so existing readers of the attribute still work.
        self.cookies = 'Ecp_ClientId=1220704081600243712; Ecp_loginuserbk=wh0302; knsLeftGroupSelectItem=1;2;; Ecp_ClientIp=202.197.9.22; SID_sug=126002; _pk_ses=*; Ecp_IpLoginFail=220708116.162.2.165; _pk_id=6f3fe3b8-fcc4-4111-ad5f-c2f8aba3a0e3.1656893782.2.1657209667.1657209661.; ASP.NET_SessionId=uppv4o3sgpf45j1lmsc4ogin; SID_kns8=015123157; dblang=ch'

    def get_search_html(self):
        """Post the search query and return the parsed result page.

        Returns:
            A ``bs4.BeautifulSoup`` of the response body, or ``None`` when the
            server answers with anything other than HTTP 200.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        params = {
            'searchtype': 0,
            'fieldtype': 101,
            'keyword': 'computer',
        }
        res = requests.post(self.url, data=params, headers=self.headers,
                            timeout=self.timeout)
        if res.status_code != 200:
            return None
        return bs4.BeautifulSoup(res.text, 'html.parser')

    def get_html_by_link(self, link):
        """Fetch a scheme-relative link (``//host/path``) and parse it.

        Args:
            link: href as found in the page; ``'https:'`` is prepended to it.

        Returns:
            A ``bs4.BeautifulSoup`` of the page, or ``None`` on a non-200
            response.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        logger.debug(link)
        res = requests.get('https:' + link, headers=self.headers,
                           timeout=self.timeout)
        if res.status_code != 200:
            return None
        return bs4.BeautifulSoup(res.text, 'html.parser')

    @staticmethod
    def get_paper_links(soup: "bs4.BeautifulSoup"):
        """Return the hrefs of all paper anchors on a search result page.

        Anchors without an ``href`` and anchors whose href is the
        ``javascript:void(0);`` placeholder are skipped.
        """
        links_list = []
        for anchor in soup.find_all('a', class_='c-company-top-link'):
            # Compare the href, not the element: an element never equals a
            # plain string, so comparing the element filters nothing.
            href = anchor.attrs.get('href')
            if not href or href == Crawler._VOID_LINK:
                continue
            links_list.append(href)
        return links_list

    def parse_paper_html(self, soup: "bs4.BeautifulSoup", res):
        """Parse one paper detail page and append a ``Paper`` to ``res``.

        Args:
            soup: parsed paper page, or ``None`` if the fetch failed.
            res: shared collection the resulting ``Paper`` is appended to.

        Nothing is appended when ``soup`` is ``None`` or has no title element.
        """
        if soup is None:
            return
        title_node = soup.find('div', class_='c-card__title2')
        if title_node is None:
            # Not a paper detail page (or the layout changed); skip it rather
            # than let the worker thread die with an AttributeError.
            logger.debug('paper page without a title element, skipped')
            return
        title = title_node.text.strip()

        authors = soup.find('div', class_='c-card__author')
        # A space inside the author text is treated as "authors cannot be
        # resolved"; a placeholder author is recorded instead. A missing
        # author element is handled the same way.
        # NOTE(review): presumably the space marks plain-text author names
        # without profile links -- confirm against the live page.
        if authors is None or '\x20' in authors.text:
            res.append(Paper(title, [Author(None, None, None)]))
            return

        authors_list = []
        for anchor in authors.find_all('a'):
            href = anchor.attrs.get('href')
            if not href or href == self._VOID_LINK:
                continue
            authors_list.append(
                self.parse_author_html(self.get_html_by_link(href)))

        # Store the parsed Author objects, not the raw author element.
        res.append(Paper(title, authors_list))

    @staticmethod
    def parse_author_html(soup: "bs4.BeautifulSoup"):
        """Build an ``Author`` from an author profile page.

        If any of the name, college or major elements is missing, the name is
        read from the navigation title element instead and college and major
        are ``None``. If ``soup`` is ``None`` (the page could not be fetched),
        all three fields are ``None``.
        """
        if soup is None:
            return Author(None, None, None)
        try:
            name = soup.find('div', class_='zz-middle-name-text').text.strip()
            college = soup.find('div', class_='zz-middle-company').text.strip()
            major = soup.find('div', class_='zz-info-chart').find('span').text.strip()
        except AttributeError:
            # One of the lookups above returned None: fall back to the
            # navigation title, which may itself be absent.
            fallback = soup.find('div', class_='c-nav__item c-nav__title')
            name = fallback.text.strip() if fallback is not None else None
            college = None
            major = None
        return Author(name, college, major)

    def crawl(self):
        """Run the whole crawl and return a ``deque`` of ``Paper`` objects.

        Paper pages are fetched one after another in the calling thread; each
        fetched page is then parsed in its own thread, which also fetches the
        author pages. The order of the results is therefore not defined. An
        empty ``deque`` is returned when the search page cannot be fetched.
        """
        res = deque()
        search_page = self.get_search_html()
        if search_page is None:
            return res

        threads = []
        for link in self.get_paper_links(search_page):
            page = self.get_html_by_link(link)
            if page is None:
                continue
            threads.append(
                threading.Thread(target=self.parse_paper_html, args=(page, res)))
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return res

    @staticmethod
    def write2test(content):
        """Write ``content`` to ``test.html`` in the working directory (debug aid)."""
        with open('test.html', 'w', encoding='utf-8') as t:
            t.write(content)
+ + + + + 相关度 + + + 下载次数 + + + 被引频次 + + + 最新文献 + + + 历史文献 + + + + + + + + 全部 + + + 全部期刊 + + + + 核心期刊 + + + SCI(科学引文索引) + + + EI(工程索引) + + + CSSCI(中文社会科学引文索引) + + + + 学位论文 + + + + 博士论文 + + + 硕士论文 + + + + 会议论文 + + + 重要报纸 + + + + + + + + 全部文献 + + + 近十年文献 + + + 近五年文献 + + + 近三年文献 + +自定义 + + + + + + + + + + + + + + + + Knowledge graph for identifying geological disasters by integrating comp... + + + Qinjun Qiu Zhong Xie Die Zhang Kai Ma Liufeng Tao + + +The occurrence of geological disasters can have a large impact on urban safety. Protecting people’s safety is the most important concern w... + +Journal of Earth Science   网络首发  2022.03.05 +网络首发 +SCI科学 + + + + 下载:56    被引:0 + + + + + + + Real-time gradation-expressible amplitude-modulationtype electroholography ... + + + Ren Noguchi Kohei Suzuki Yoshiki Moriguchi Minoru Oikawa Yuichiro Mori + + +In amplitude-modulation-type electroholography, the binary-weighted computer-generated hologram(BW-CGH) facilitates the gradation-expressib... + +Chinese Optics Letters   2021年11期 +SCI科学 +EI工程 + + + + 下载:14    被引:0 + + + + + + + CAIE中Computer Science考试对信息技术纳入高考的启示 + + + 孔珍珍 + + +介绍和分析CAIE英式课程考试中Computer Science学科的课程内容、考试制度、试题模式,借鉴其成功经验与考试制度,以期借他山之石对我国信息技术课程纳入... + +中国现代教育装备   2021年24期 + + + + 下载:20    被引:0 + + + + + + + A review of the research and application of deep learning-based computer... + + + Zhang Lingxin Shen Junkai Zhu Baijie + + +Damage detection is a key procedure in maintenance throughout structures′ life cycles and post-disaster loss assessment. Due to the comple... + +Earthquake Engineering and Engineering Vibration   2022年01期 +SCI科学 +EI工程 + + + + 下载:109    被引:0 + + + + + + +Computer-Assisted Language Learning: the State of the Art with Specia... + + + +This paper attempts to present a critical review of the state of the art in computer-assisted language learning(CALL) with special referenc... 
+ +外语电化教学   2008年02期 +CSSCI + + + + 下载:535    被引:5 + + + + + + + Opening Moves Involved in Text-based Computer-Mediated-Communication ... + + + 李莉华 + + +The development of science and technology has made it not only possible but very convenient for people living in different parts of the wor... + +海外英语   2011年03期 + + + + 下载:69    被引:0 + + + + + + + Emotion recognition for human-computer interaction + + + Jianhua TAO + + +<正>Emotion recognition is to quantify, describe and recognize different emotional states through the behavioral and physiological response... + +虚拟现实与智能硬件(中英文)   2021年01期 + + + + 下载:17    被引:0 + + + + + + + A lightweight data-voting strategy for triple-modular redundant control ... + + + LIU Bo YANG MengFei WANG Yong YUAN Li LIU ChaoWei + + +Triple-modular redundancy(TMR), a well-known methodology for improving the reliability of computer systems, has been used for onboard contr... + +Science China(Technological Sciences)   2022年02期 +SCI科学 +EI工程 + + + + 下载:34    被引:0 + + + + + + +Computer generation of detailed reaction networks in hydrocracking of... + + + Jingjing Wang Wei Zhao Kunpeng Song Hongwei Xiang Liping Zhou + + +Fischer-Tropsch synthesis(FTS) wax is a mixture of linear hydrocarbons with carbon number from C7 to C70+.Converting FTS wax into high-qual... + +Chinese Journal of Chemical Engineering   2022年01期 +SCI科学 +EI工程 + + + + 下载:6    被引:0 + + + + + + + Transfer Learning Algorithm Design for Feature Transfer Problem in Motor Im... + + + Yu Zhang Huaqing Li Heng Dong Zheng Dai Xing Chen + + +The non-stationary of the motor imagery electroencephalography(MI-EEG) signal is one of the main limitations for the development of motor i... 
+ +China Communications   2022年02期 +SCI科学 + + + + 下载:31    被引:0 + + + + +点击加载更多 + + + + + + + + + + + + 筛选 + + + + + + + 学科 + + + + + + + + + + + + + 研究层次 + + + + + + + + + + + + 机构 + + + + + + + + + + + + 主办单位 + + + + + + + + + + + + 作者 + + + + + + + + + + + + 导师 + + + + + + + + + + + + 重置 + + + 完成 + + + + + + + + + + + + + + + + + + +文献 +期刊 +工具书 +图书 +文集 + + +首页 +搜索 +客户端 +充值 +帮助 + + + 违法和不良信息举报电话:400-062-8866 + 举报邮箱:jubao@cnki.net + + + ©2022中国知网(CNKI) + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dcs/tests/zhiwang.py b/dcs/tests/zhiwang.py index 3c0eab7..ebb47d6 100644 --- a/dcs/tests/zhiwang.py +++ b/dcs/tests/zhiwang.py @@ -15,7 +15,10 @@ class Paper: self.authors = authors def __str__(self): - return f'{self.title}, authors' + s = f'title: {self.title}\n' + for i in self.authors: + s += f'author: {i}\n' + return s # 定义作者类 @@ -25,6 +28,9 @@ class Author: self.college = college self.major = major + def __str__(self): + return f'{self.name}, {self.college}, {self.major}' + # 进入知网首页并搜索关键词 def driver_open(driver, key_word): @@ -80,7 +86,6 @@ def spider(driver, soup, papers): # print('\n') paper = Paper(title, authors) papers.append(paper) - papers.append(paper) # break # TODO: this is to shorten time of crawling # time.sleep(1) # 每调一次spider休息1s