parent ad427ef9bc
commit 60f93c0f0e
@@ -0,0 +1,103 @@
import threading
from collections import deque

import bs4
import requests  # used directly below; assumed otherwise to arrive via the wildcard import

from dcs.tests.zhiwang import *  # expected to provide Paper, Author and logger used below

class Crawler:
    def __init__(self):
        # Earlier endpoints kept for reference; only the mobile search page is used.
        # self.url = 'https://kns.cnki.net/kns8/Brief/GetGridTableHtml'
        # self.url = 'https://kns.cnki.net/kns8/Group/Result'
        self.url = 'https://wap.cnki.net/touch/web/Article/Search'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49'}
        # Stored but not attached to any request below.
        self.cookies = 'Ecp_ClientId=1220704081600243712; Ecp_loginuserbk=wh0302; knsLeftGroupSelectItem=1;2;; Ecp_ClientIp=202.197.9.22; SID_sug=126002; _pk_ses=*; Ecp_IpLoginFail=220708116.162.2.165; _pk_id=6f3fe3b8-fcc4-4111-ad5f-c2f8aba3a0e3.1656893782.2.1657209667.1657209661.; ASP.NET_SessionId=uppv4o3sgpf45j1lmsc4ogin; SID_kns8=015123157; dblang=ch'

    def get_search_html(self):
        # POST the search form and return the parsed result page, or None on a non-200 response.
        params = {
            'searchtype': 0,
            'fieldtype': 101,
            'keyword': 'computer'
        }

        res = requests.post(self.url, data=params, headers=self.headers)
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            return soup

    def get_html_by_link(self, link):
        # Fetch a protocol-relative link ('//...') and return the parsed page, or None on failure.
        logger.debug(link)
        res = requests.get('https:' + link, headers=self.headers)
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            return soup

    @staticmethod
    def get_paper_links(soup: bs4.BeautifulSoup):
        # Collect the href of every result link on the search page.
        links = soup.find_all('a', class_='c-company-top-link')
        links_list = []
        for i in links:
            href = i.attrs.get('href')
            # Skip anchors without a real target (missing or javascript placeholder hrefs).
            if not href or href == 'javascript:void(0);':
                continue
            links_list.append(href)
        return links_list

    def parse_paper_html(self, soup: bs4.BeautifulSoup, res):
        # Parse one paper page and append a Paper record to the shared result deque.
        title = soup.find('div', class_='c-card__title2').text.strip()

        authors = soup.find('div', class_='c-card__author')
        # If the author block contains a bare space, fall back to a placeholder author.
        if '\x20' in authors.text:
            res.append(Paper(title, [Author(None, None, None)]))
            return
        authors_links = [i.attrs['href'] for i in authors.find_all('a')]
        authors_list = []
        for i in authors_links:
            if i == 'javascript:void(0);':
                continue
            authors_list.append(self.parse_author_html(self.get_html_by_link(i)))

        res.append(Paper(title, authors_list))

    @staticmethod
    def parse_author_html(soup: bs4.BeautifulSoup):
        # Author profile pages carry name/college/major; fall back to the page
        # title when the profile layout is missing.
        try:
            name = soup.find('div', class_='zz-middle-name-text').text.strip()
            college = soup.find('div', class_='zz-middle-company').text.strip()
            major = soup.find('div', class_='zz-info-chart').find('span').text.strip()
        except AttributeError:
            name = soup.find('div', class_='c-nav__item c-nav__title').text.strip()
            college = None
            major = None
        return Author(name, college, major)

    def crawl(self):
        # Fetch the search page, then parse each paper page in its own thread,
        # collecting results in a deque (appends are atomic in CPython).
        sh = self.get_search_html()
        pl = self.get_paper_links(sh)
        res = deque()
        threads = []
        for p in pl:
            if p == 'javascript:void(0);':
                continue
            p = self.get_html_by_link(p)
            t = threading.Thread(target=self.parse_paper_html, args=(p, res))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return res

    @staticmethod
    def write2test(content):
        # Dump raw HTML to test.html; not called elsewhere in this module.
        with open('test.html', 'w', encoding='utf-8') as t:
            t.write(content)


if __name__ == '__main__':
    crawler = Crawler()
    data = crawler.crawl()
    for r in data:
        print(r)
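
For readers outside the dcs project: Paper, Author, and logger are pulled in through the wildcard import from dcs.tests.zhiwang, and their real definitions are not part of this commit. The stand-ins below are a minimal sketch, with field names inferred only from how Crawler uses them, so the module above can be exercised in isolation; they are assumptions, not the project's actual classes.

# Hypothetical stand-ins for names supplied by dcs.tests.zhiwang; field names
# are assumptions based on how Crawler calls them, not the real definitions.
import logging
from dataclasses import dataclass, field
from typing import List, Optional

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('crawler')


@dataclass
class Author:
    name: Optional[str]
    college: Optional[str]
    major: Optional[str]


@dataclass
class Paper:
    title: str
    authors: List[Author] = field(default_factory=list)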