You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
2.7 KiB
86 lines
2.7 KiB
import csv
|
|
import threading
|
|
|
|
import dcs.tests.config
|
|
from msedge.selenium_tools import Edge
|
|
from msedge.selenium_tools import EdgeOptions
|
|
|
|
from dcs.tests.zhiwang import *
|
|
from loguru import logger
|
|
|
|
|
|
def translate(word):
    """Translate *word* using the Youdao web translation API.

    Source and target languages are auto-detected by the service.

    :param word: text to translate
    :return: the translated text (first result segment)
    :raises requests.RequestException: on network failure or non-2xx response
    :raises KeyError/IndexError: if the response JSON lacks the expected shape
    """
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    data = {'i': word,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_REALTIME',
            'typoResult': 'false'}
    # NOTE(review): `requests` is only in scope via the `zhiwang` wildcard
    # import above — consider an explicit `import requests` at file level.
    # A timeout prevents the thread from hanging forever on a dead endpoint.
    r = requests.post(url, data, timeout=10)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    answer = r.json()
    result = answer['translateResult'][0][0]['tgt']
    return result
|
|
|
|
|
|
def crawl_zhiwang(word, pages_start=1, pages_end=2):
    """Crawl CNKI (zhiwang) search results for *word* with a headless Edge browser.

    Scrapes result pages in the half-open range [pages_start, pages_end).

    :param word: search keyword
    :param pages_start: first result page to scrape (1-based)
    :param pages_end: scrape up to, but not including, this page number
    :return: list of scraped paper objects (previously discarded by this
        function; returning it is backward-compatible and lets callers
        persist the data — TODO write to database)
    """
    edge_options = EdgeOptions()
    edge_options.use_chromium = True
    edge_options.add_argument('headless')  # no visible browser window
    # NOTE(review): hard-coded driver path only works on this one machine.
    driver = Edge(options=edge_options, executable_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe')
    papers = []  # papers collected so far
    try:
        soup = driver_open(driver, word)  # submit the search for `word`
        # Page 1 is already loaded by the initial search, so scrape it directly.
        if pages_start == 1:
            spider(driver, soup, papers)
            pages_start += 1
        for pn in range(pages_start, pages_end):
            content = change_page(driver, pn)
            spider(driver, content, papers)
    finally:
        # Always release the browser, even if scraping raises mid-way.
        driver.close()
    # TODO write results to the database
    return papers
|
|
|
|
|
|
def write2csv(papers: list, file_name='./paper_author.csv'):
    """Write the (author, paper) pairs of *papers* to a CSV file.

    Emits one row per named author per paper, with the columns
    ``name, college, major, paper``. Authors with an empty name are skipped.

    :param papers: paper objects exposing ``.title`` and ``.authors``, each
        author exposing ``.name``, ``.college`` and ``.major``
    :param file_name: destination CSV path (overwritten if it exists)
    """
    # `with` guarantees the file is closed even if writing fails part-way;
    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open(file_name, 'w', encoding='utf-8', newline='') as f_papers_authors:
        writer_p_a = csv.writer(f_papers_authors)
        writer_p_a.writerow(["name", "college", "major", "paper"])  # header row
        for paper in papers:
            for author in paper.authors:
                if author.name:  # skip placeholder / unnamed authors
                    writer_p_a.writerow([author.name, author.college, author.major, paper.title])
|
|
|
|
|
|
class Spider(threading.Thread):
    """Background crawler thread: distributes tasks, then crawls zhiwang for a keyword."""

    def __init__(self, word: str, pages_start=1, pages_end=1):
        """
        :param word: keyword to search for
        :param pages_start: first result page to crawl (1-based)
        :param pages_end: crawl up to, but not including, this page number
        """
        super().__init__()
        self.word = word
        self.daemon = True  # don't keep the process alive for this thread
        self.pages_start = pages_start
        self.pages_end = pages_end

    def distribute_spiders(self):
        """Hand crawling tasks out to the currently idle spider workers."""
        free_spiders = dcs.tests.config.get_free_spiders()
        for sp in free_spiders:
            print(self.pages_start, sp)
            # TODO publish the task to this worker

    def run(self) -> None:
        """Thread entry point: distribute tasks, then crawl the configured page range."""
        logger.info('crawling...')
        self.distribute_spiders()
        crawl_zhiwang(word=self.word, pages_start=self.pages_start, pages_end=self.pages_end)