"""
Crawl the titles of news items on the Tianshui Municipal People's Government
website that match a given list of keywords within a given date range, and
store them in a database.

To keep the test run small, this script crawls only the first 10 result pages
per keyword, i.e. at most 100 news items.

This is the multiprocessing variant: result pages are fetched concurrently by
a process pool, and the news titles are then extracted from the JSON
responses.

Note: the keyword list defaults to ['灾害'] and the date range defaults to
2018-01-01 through 2018-12-31.

Args:
    keywords: list of keywords used to search for news
    begin_date: start date of the search window
    end_date: end date of the search window
    size: maximum number of news or policy items returned per request

Examples:
```
main(keywords=['灾害'],
     begin_date='2018-01-01',
     end_date='2018-12-31',
     size=10)
```
"""
import logging
import multiprocessing
from typing import List

import tqdm

import util

lock = multiprocessing.Lock()


@util.timeit
def main(keywords: List[str], begin_date: str, end_date: str, size: int = 10):
"""
|
|
|
|
|
爬取与提供的关键词列表相关的新闻.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
keywords: 用于搜索新闻的关键词列表
|
|
|
|
|
begin_date: 开始日期,用于搜索
|
|
|
|
|
end_date: 结束日期,用于搜索
|
|
|
|
|
size: 一次请求返回的新闻或政策的最大数量
|
|
|
|
|
"""
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)s - %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        filename='log.txt',
                        encoding='utf-8')  # encoding= requires Python 3.9+

    logging.info("Starting the multiprocessing approach")
    spider = util.Spider(keywords=keywords,
                         begin_date=begin_date,
                         end_date=end_date,
                         size=size)

    title_list = []
    # 10 result pages per keyword, up to `size` items per page.
    pbar = tqdm.tqdm(total=size * 10 * len(keywords),
                     desc='Multiprocessing crawl progress',
                     unit='item',
                     ncols=80)
    with multiprocessing.Pool(processes=5) as pool:
        # Submit one fetch task per (keyword, page) pair: 10 pages per keyword.
        results = []
        for keyword in keywords:
            for current in range(1, 11):
                logging.info(f'keyword: {keyword}, current: {current}')
                config = spider.get_config(keyword, current)
                results.append(pool.apply_async(spider.fetch, (config,)))

        # Collect each page as it completes and extract the news titles.
        for result in results:
            data = result.get()
            title_list += spider.parse(data)

            # pbar is only updated here in the parent process, so the lock is
            # defensive rather than strictly required.
            with lock:
                pbar.update(size)

    spider.save(title_list)
    pbar.close()
    logging.info("Crawl finished")
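

# The module docstring notes that news titles are extracted from JSON
# responses. That logic lives in util.Spider.parse, which is not shown here;
# the sketch below illustrates one plausible shape of that step. The "news"
# and "title" field names are assumptions for illustration, not the site's
# confirmed response schema.
def _parse_sketch(data: str) -> List[str]:
    """Illustrative only: pull news titles out of a JSON response string."""
    import json
    payload = json.loads(data)
    return [item["title"] for item in payload.get("news", []) if "title" in item]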


if __name__ == "__main__":
    main(keywords=['灾害'],
         begin_date='2018-01-01',
         end_date='2018-12-31',
         size=10)