forked from p3t2ja9zs/dcs
parent
ad427ef9bc
commit
60f93c0f0e
@ -0,0 +1,103 @@
import threading
from collections import deque

import bs4
import requests

# Paper, Author and the module-level logger are assumed to be provided by this star import.
from dcs.tests.zhiwang import *


class Crawler:
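    """Crawler for CNKI search results (wap.cnki.net): runs a keyword search,
    follows each paper link and collects Paper/Author records."""
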
    def __init__(self):
        # Earlier desktop endpoints, kept for reference; only the final
        # assignment (the mobile search page) is actually used.
        # self.url = 'https://kns.cnki.net/kns8/Brief/GetGridTableHtml'
        # self.url = 'https://kns.cnki.net/kns8/Group/Result'
        self.url = 'https://wap.cnki.net/touch/web/Article/Search'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49'}
        # Session cookies captured from a browser; not currently attached to the requests below.
        self.cookies = 'Ecp_ClientId=1220704081600243712; Ecp_loginuserbk=wh0302; knsLeftGroupSelectItem=1;2;; Ecp_ClientIp=202.197.9.22; SID_sug=126002; _pk_ses=*; Ecp_IpLoginFail=220708116.162.2.165; _pk_id=6f3fe3b8-fcc4-4111-ad5f-c2f8aba3a0e3.1656893782.2.1657209667.1657209661.; ASP.NET_SessionId=uppv4o3sgpf45j1lmsc4ogin; SID_kns8=015123157; dblang=ch'

    def get_search_html(self):
        """Post a search query to CNKI and return the parsed result page."""
        params = {
            'searchtype': 0,
            'fieldtype': 101,
            'keyword': 'computer'
        }

        res = requests.post(self.url, data=params, headers=self.headers)
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            return soup
        # Implicitly returns None on a non-200 response.

    def get_html_by_link(self, link):
        """Fetch a protocol-relative link ('//...') and return the parsed page."""
        logger.debug(link)
        res = requests.get('https:' + link, headers=self.headers)
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            return soup

    @staticmethod
    def get_paper_links(soup: bs4.BeautifulSoup):
        """Collect the href of every paper link on a search-result page."""
        links = soup.find_all('a', class_='c-company-top-link')
        links_list = []
        for i in links:
            # Skip placeholder anchors; compare the href, not the tag itself.
            href = i.attrs.get('href')
            if not href or href == 'javascript:void(0);':
                continue
            links_list.append(href)
        return links_list

    def parse_paper_html(self, soup: bs4.BeautifulSoup, res):
        """Extract title and authors from a paper page and append a Paper to res."""
        title = soup.find('div', class_='c-card__title2').text.strip()

        authors = soup.find('div', class_='c-card__author')
        # An author block containing a bare space is treated as having no
        # per-author profile pages; fall back to a placeholder Author.
        if '\x20' in authors.text:
            res.append(Paper(title, [Author(None, None, None)]))
            return
        authors_links = [i.attrs['href'] for i in authors.find_all('a')]
        authors_list = []
        for i in authors_links:
            if i == 'javascript:void(0);':
                continue
            authors_list.append(self.parse_author_html(self.get_html_by_link(i)))

        # Append the parsed Author objects, not the raw author tag.
        res.append(Paper(title, authors_list))

    @staticmethod
    def parse_author_html(soup: bs4.BeautifulSoup):
        """Extract name, college and major from an author profile page."""
        try:
            name = soup.find('div', class_='zz-middle-name-text').text.strip()
            college = soup.find('div', class_='zz-middle-company').text.strip()
            major = soup.find('div', class_='zz-info-chart').find('span').text.strip()
        except AttributeError:
            # Fallback for pages without a full profile: only the name is available.
            name = soup.find('div', class_='c-nav__item c-nav__title').text.strip()
            college = None
            major = None
        return Author(name, college, major)

    def crawl(self):
        """Run a search, then parse every result page in its own thread."""
        sh = self.get_search_html()
        pl = self.get_paper_links(sh)
        # deque.append is thread-safe, so the worker threads can share it.
        res = deque()
        threads = []
        for p in pl:
            if p == 'javascript:void(0);':
                continue
            p = self.get_html_by_link(p)
            t = threading.Thread(target=self.parse_paper_html, args=(p, res))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return res

    @staticmethod
    def write2test(content):
        with open('test.html', 'w', encoding='utf-8') as t:
            t.write(content)


if __name__ == '__main__':
    crawler = Crawler()
    data = crawler.crawl()
    for r in data:
        print(r)