@@ -1,47 +1,30 @@
-import csv
 import random
 import socket
 import threading
 from time import sleep
 from typing import Optional

 from msedge.selenium_tools import Edge
 from msedge.selenium_tools import EdgeOptions

 from conf.config import global_var, get_free_addresses, get_crawl_result, get_by_cookie, set_state_client
 from dcs.tests.zhiwang import *
 from dcs.tools.database import get_crawl_result_by_crawl_id, write_result2database
 from dcs.tools.database import get_last_crawl_id, create_crawl_result_table
+from dcs.tools.database import write_results2database


-def write2database(paper: Paper, table_name: str, last_crawl_id: int):
-    logger.info(f'[DATABASE] writing to database: {paper.title}')
+def write2results(paper: Paper, results: list):
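+    # collect one (name, college, major, paper title) tuple per named author into the shared results list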
     for author in paper.authors:
         if author.name:
-            write_result2database([author.name, author.college, author.major, paper.title], table_name, last_crawl_id)
-
-
-def write2csv(papers: list, file_name='./paper_author.csv'):
-    # open the output file
-    f_papers_authors = open(file_name, 'w', encoding='utf-8', newline='')
-    writer_p_a = csv.writer(f_papers_authors)  # build a csv writer on top of the file object
-    writer_p_a.writerow(["name", "college", "major", "paper"])  # header row of the csv file
-
-    # iterate over every paper
-    for paper in papers:
-        # write the paper's authors to paper_author.csv
-        for author in paper.authors:
-            if author.name:
-                # print(author + " ")
-                writer_p_a.writerow([author.name, author.college, author.major, paper.title])
-
-    # close the file
-    f_papers_authors.close()
+            results.append((author.name, author.college, author.major, paper.title))


 class Crawler(threading.Thread):
-    def __init__(self, partial_task: 'Spider_partial_task', last_crawl_id):
+    def __init__(self, partial_task: 'Spider_partial_task', last_crawl_id, results):
         super(Crawler, self).__init__()
         self.partial_task = partial_task
         self.last_crawl_id = last_crawl_id
+        self.results = results
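+        # results list shared with the owning Spider_task; read back in Spider_task.compose_result()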
         self.edge_driver_path = dict(global_var.configs.items('crawler'))['edge_driver_path']

     def crawl_zhiwang(self, user_name=None):
@@ -64,14 +47,14 @@ class Crawler(threading.Thread):
         spider(driver, soup, papers)
         self.partial_task.pages_start += 1
         while paper_id < len(papers):
-            write2database(papers[paper_id], table_name=table_name, last_crawl_id=self.last_crawl_id)
+            write2results(papers[paper_id], results=self.results)
             paper_id += 1

         while self.partial_task.pages_start < self.partial_task.pages_end:
             content = change_page(driver, self.partial_task.pages_start)
             spider(driver, content, papers)
             while paper_id < len(papers):
-                write2database(papers[paper_id], table_name=table_name, last_crawl_id=self.last_crawl_id)
+                write2results(papers[paper_id], results=self.results)
                 paper_id += 1
             self.partial_task.pages_start += 1
         driver.close()
@@ -85,11 +68,12 @@ class Crawler(threading.Thread):
         # simulated crawling
         logger.debug('simulation crawling...')
         paper = Paper('test', [Author('test', 'test', 'test')])
-        write2database(paper, table_name=table_name, last_crawl_id=last_crawl_id)
-        write2database(paper, table_name=table_name, last_crawl_id=last_crawl_id)
-        write2database(paper, table_name=table_name, last_crawl_id=last_crawl_id)
+        write2results(paper, results=self.results)
+        write2results(paper, results=self.results)
+        write2results(paper, results=self.results)

         # over
         sleep(10)
         self.partial_task.pages_start = self.partial_task.pages_end

     def run(self) -> None:
@@ -116,7 +100,11 @@ class Spider_partial_task:
         self.crawl_id = None

     def is_partial_task_crawl_completely(self):
-        return self.pages_start == self.pages_end
+        finished = (self.pages_start == self.pages_end)
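+        # a finished local task frees one crawler slot for Spider_task's scheduling loop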
+        if finished:
+            if self.task_type == 'local':
+                global_var.spider.crawlers -= 1
+        return finished

     def __str__(self):
         return f'{self.full_task.client_socket.getpeername(), self.request_map}'
@@ -125,12 +113,14 @@ class Spider_task(threading.Thread):
 class Spider_task(threading.Thread):
     def __init__(self, client_socket: socket.socket, request_map: dict):
         super().__init__()
+        self.free_remote_nodes = None
         self.table_name = f'{Spider_partial_task(self, request_map).cui.user_name}_crawl_result'
         self.last_crawl_id = get_last_crawl_id(table_name=self.table_name)
         self.client_socket = client_socket
         self.request_map = request_map
         self.partial_tasks: list[Spider_partial_task] = []
         self.const_page = 1
+        self.results = []

     def distribute_task(self):
         # distribute tasks, 3 pages as a task
@@ -156,27 +146,17 @@ class Spider_task(threading.Thread):
     def compose_result(self):
         logger.info('[COMPOSE] composing task...')
-        result = dict()
+        logger.info(f'[RESULT] {self.results}')
         remote_result = get_crawl_result(self.request_map['cookie'])
         for result_map in list(remote_result):
-            result.update(result_map)
             create_crawl_result_table(table_name=self.table_name)
             for _, data in result_map.items():
-                write_result2database([data['name'], data['college'], data['major'], data['title']], self.table_name, self.last_crawl_id)
-        for task in self.partial_tasks:
-            if task.task_type == 'local':
-                local_result = dict()
-                local_result_database = get_crawl_result_by_crawl_id(
-                    f"{get_by_cookie(task.request_map['cookie']).user_name}_crawl_result",
-                    task.crawl_id)
-                initial_id = local_result_database[0][0]
-                for res in local_result_database:
-                    local_result.update({res[0]-initial_id+1: {'name': res[1], 'college': res[2], 'major': res[3], 'paper': res[4]}})
-                logger.info(f'[RESULT] {local_result}')
-                result.update(local_result)
-
-        result.update({'crawl_id': self.last_crawl_id+1, 'table_name': self.table_name})
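+                # fold each remote row back into Paper/Author objects so local and remote results share one path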
+                write2results(Paper(data['title'], [Author(data['name'], data['college'], data['major'])]), self.results)
+
+        logger.info(f'[DATABASE] writing crawl results to database...')
+        write_results2database(self.results, self.table_name, self.last_crawl_id)
+
+        result = {'crawl_id': self.last_crawl_id+1, 'table_name': self.table_name, 'data': self.results}
         global_var.communicator.add_response('response', self.client_socket, result)

     def run(self) -> None:
@@ -184,31 +164,38 @@ class Spider_task(threading.Thread):
                                              {'crawling state': 'starting, please wait...'})
         self.distribute_task()

-        free_remote_nodes = list(get_free_addresses())
-        logger.info(f'[REMOTE] free nodes: {free_remote_nodes}')
         while True:
+            self.free_remote_nodes = list(get_free_addresses())
+            random.shuffle(self.free_remote_nodes)
+            logger.info(f'[REMOTE] free nodes: {self.free_remote_nodes}')
             for task in self.partial_tasks:
                 if task.is_partial_task_crawl_completely():
                     continue
                 else:
-                    random.shuffle(free_remote_nodes)
                     current_task_thread = task.thread
                     if current_task_thread is None:
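+                        # for/else: hand the task to the first free remote node; the else branch runs only
+                        # when no node was claimed (no break), falling back to a local crawler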
-                        for f_node in free_remote_nodes:
+                        for f_node in self.free_remote_nodes:
                             address = f_node
-                            logger.info('[TASK] generating remote task')
+                            logger.info(f'[TASK] generating remote task {task.request_map}')
                             task.thread = global_var.requester
                             task.task_type = 'remote'
                             global_var.requester.get(address, task)
-                            free_remote_nodes.remove(f_node)
-                            set_state_client(f_node, 'busy')
+                            set_state_client('busy', address=f_node)
+                            sleep(1)
+                            self.free_remote_nodes.remove(f_node)
                             break
                         else:
-                            logger.info('[TASK] generating local task')
-                            crawler = Crawler(task, self.last_crawl_id)
-                            task.thread = crawler
-                            task.task_type = 'local'
-                            crawler.start()
+                            logger.info(f'[TASK] generating local task {task.request_map}')
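+                            # throttle local work: when the crawler count hits the limit, leave the task
+                            # pending and retry it on the next polling pass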
+                            if global_var.spider.crawlers >= global_var.spider.max_count_of_crawlers:
+                                logger.warning(f'[TASK] generate failed, crawler limit exceeded! spider task {task.request_map} is at state waiting...')
+                                break
+                            else:
+                                crawler = Crawler(task, self.last_crawl_id, self.results)
+                                task.thread = crawler
+                                task.task_type = 'local'
+                                crawler.start()
+                                global_var.spider.crawlers += 1
             if self.is_all_task_crawled():
                 break
             sleep(5)  # poll every 5 seconds

         self.compose_result()