"""
Crawl the titles of news items that match a given list of keywords and were
published within a given date range from the Tianshui Municipal People's
Government website, and store them in a database.

To keep the run small, this script only crawls the first 10 result pages,
i.e. at most 100 news items, as a test.

This is the concurrent variant: it fetches the result pages asynchronously in
parallel, then extracts the news titles from the JSON responses.

Note: the keyword list defaults to ['灾害'] ("disaster"), and the date range
defaults to 2018-01-01 through 2018-12-31.

Args:
    keywords: list of keywords to search news for
    begin_date: start date of the search range
    end_date: end date of the search range
    size: maximum number of news or policy items returned per request

Examples:
    ```
    asyncio.run(
        main_async(keywords=['灾害'],
                   begin_date='2018-01-01',
                   end_date='2018-12-31',
                   size=10))
    ```
"""

import asyncio
import logging
from typing import List

import tqdm

import util

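
# `util` is this project's helper module (not shown here). Inferred from how it
# is used below, the surface this script relies on is roughly:
#
#   @util.timeit_async                  decorator that logs a coroutine's runtime
#   util.Spider(keywords, begin_date, end_date, size)
#       .get_config(keyword, current)   build request parameters for one result page
#       .fetch_async(config)            coroutine fetching that page's JSON payload
#       .parse(data)                    extract news titles from the JSON payload
#       .save(title_list)               persist the collected titles to the database
#
# This is a sketch of the assumed contract, not the actual util.py API.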
@util.timeit_async
async def main_async(keywords: List[str],
                     begin_date: str,
                     end_date: str,
                     size: int = 10):
    """
    Asynchronously crawl news related to the given list of keywords.

    Args:
        keywords: list of keywords to search news for
        begin_date: start date of the search range
        end_date: end date of the search range
        size: maximum number of news or policy items returned per request
    """
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)s - %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        filename='log.txt',
                        encoding='utf-8')

    logging.info("Starting asynchronous crawl")

    spider = util.Spider(keywords=keywords,
                         begin_date=begin_date,
                         end_date=end_date,
                         size=size)

    # 10 pages per keyword, at most `size` items per page.
    pbar = tqdm.tqdm(total=size * 10 * len(keywords),
                     desc='async crawl progress', unit='item', ncols=80)
    title_list = []
    tasks = []
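
    # Schedule one fetch task per (keyword, result page). Only pages 1-10 are
    # requested, matching the test limit described in the module docstring.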
    for keyword in keywords:
        for current in range(1, 11):
            logging.info(f'keyword: {keyword}, current: {current}')
            config = spider.get_config(keyword, current)
            task = asyncio.create_task(spider.fetch_async(config))
            tasks.append(task)
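
    # Consume results in completion order rather than submission order, so
    # parsing of early responses overlaps with fetching of the rest.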
    for task in asyncio.as_completed(tasks):
        data = await task
        title_list += spider.parse(data)
        # Each finished task corresponds to one page of up to `size` items.
        pbar.update(size)

    spider.save(title_list)
    pbar.close()
    logging.info("Crawl finished")


if __name__ == "__main__":
    asyncio.run(
        main_async(keywords=['灾害'],
                   begin_date='2018-01-01',
                   end_date='2018-12-31',
                   size=10))