You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
dcs/dcs/tests/spider.py

86 lines
2.7 KiB

import csv
import threading
import dcs.tests.config
from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions
from dcs.tests.zhiwang import *
from loguru import logger
def translate(word):
    """Translate *word* using the Youdao web translation API.

    Args:
        word: Source text; the API auto-detects source/target language
            ('from'/'to' are both 'AUTO').

    Returns:
        The translated string (first result's 'tgt' field).

    Raises:
        requests.RequestException: on network failure or timeout.
        KeyError / IndexError: if the API response lacks 'translateResult'.
    """
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    data = {'i': word,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_REALTIME',
            'typoResult': 'false'}
    # NOTE(review): `requests` comes in via the wildcard import from
    # dcs.tests.zhiwang — confirm; an explicit import would be clearer.
    # A timeout is required so a hung API call cannot block the crawler forever.
    r = requests.post(url, data, timeout=10)
    answer = r.json()
    result = answer['translateResult'][0][0]['tgt']
    return result
def crawl_zhiwang(word, pages_start=1, pages_end=2):
    """Crawl CNKI (zhiwang) search results for *word* with a headless Edge browser.

    Args:
        word: Search keyword passed to ``driver_open``.
        pages_start: First result page to scrape (page 1 is the page shown
            right after searching, so it needs no ``change_page`` call).
        pages_end: End page, exclusive (``range`` semantics).

    Returns:
        list: The papers collected by ``spider`` (previously discarded;
        returning them keeps callers that ignore the result working).
    """
    edge_options = EdgeOptions()
    edge_options.use_chromium = True
    edge_options.add_argument('headless')
    # TODO(review): hard-coded driver path — should come from configuration.
    driver = Edge(options=edge_options, executable_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe')
    papers = []  # papers collected by spider()
    try:
        soup = driver_open(driver, word)  # perform the search for `word`
        # Page 1 is already loaded by the search itself.
        if pages_start == 1:
            spider(driver, soup, papers)
            pages_start += 1
        for pn in range(pages_start, pages_end):
            content = change_page(driver, pn)
            spider(driver, content, papers)
    finally:
        # Always release the browser, even if scraping raises mid-way.
        # NOTE(review): close() only closes the window; quit() would also end
        # the driver session — kept as close() to preserve original behavior.
        driver.close()
    # TODO: write results to the database
    return papers
def write2csv(papers: list, file_name: str = './paper_author.csv') -> None:
    """Write one CSV row per (author, paper) pair.

    Args:
        papers: Objects with a ``title`` attribute and an ``authors`` list;
            each author exposes ``name``, ``college`` and ``major``.
        file_name: Output path; the file is truncated and rewritten.

    The output has a header row ``name, college, major, paper``; authors with
    an empty/missing name are skipped.
    """
    # `with` guarantees the file is closed even if a row write raises
    # (the original open()/close() pair leaked the handle on error).
    with open(file_name, 'w', encoding='utf-8', newline='') as f_papers_authors:
        writer_p_a = csv.writer(f_papers_authors)
        writer_p_a.writerow(["name", "college", "major", "paper"])  # header
        for paper in papers:
            for author in paper.authors:
                if author.name:  # skip placeholder/empty authors
                    writer_p_a.writerow([author.name, author.college, author.major, paper.title])
class Spider(threading.Thread):
    """Daemon thread that crawls zhiwang for one keyword over a page range."""

    def __init__(self, word: str, pages_start: int = 1, pages_end: int = 1):
        """
        Args:
            word: Search keyword to crawl.
            pages_start: First result page (inclusive).
            pages_end: End page (exclusive, ``range`` semantics).
        """
        super().__init__()
        self.word = word
        # Daemon thread: does not keep the interpreter alive on shutdown.
        self.daemon = True
        self.pages_start = pages_start
        self.pages_end = pages_end

    def distribute_spiders(self):
        """Report the task to every currently-free spider node.

        NOTE(review): currently only prints; actual task dispatch is still TODO.
        """
        free_spiders = dcs.tests.config.get_free_spiders()
        for sp in free_spiders:
            print(self.pages_start, sp)
            # TODO: publish the crawl task to this spider node

    def run(self) -> None:
        """Thread entry point: distribute the task, then crawl locally."""
        logger.info('crawling...')
        self.distribute_spiders()
        crawl_zhiwang(word=self.word, pages_start=self.pages_start, pages_end=self.pages_end)