You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

189 lines
4.9 KiB

"""
8 months ago
"""
import re
import time
import functools
import json
import asyncio
import requests
from typing import Any, Dict, List
class Spider:
"""
爬虫类
Args:
keywords (List[str]): 用于搜索新闻的关键词列表
begin_date (str): 开始日期用于搜索
end_date (str): 结束日期用于搜索
size (int): 一次请求返回的新闻或政策的最大数量
Attributes:
URL (str): 网址
"""
# 天水市人民政府网站
URL = ('https://www.tianshui.gov.cn/aop_component/'
'/webber/search/search/search/queryPage')
def __init__(self, keywords: List[str], begin_date: str, end_date: str,
size: int):
self.keywords = keywords
self.begin_date = begin_date
self.end_date = end_date
self.size = size
def get_config(self, keyword: str, current: int) -> Dict[str, Any]:
"""
获取配置信息
Args:
keyword (str): 关键词
size (int): 一次请求返回的新闻的最大数量
Returns:
Dict[str, Any]: 配置信息
"""
return {
"aliasName": "article_data,open_data,mailbox_data,article_file",
"keyWord": keyword,
"lastkeyWord": keyword,
"searchKeyWord": False,
"orderType": "score",
"searchType": "text",
"searchScope": "3",
"searchOperator": 0,
"searchDateType": "custom",
"searchDateName": f"{self.begin_date}-{self.end_date}",
"beginDate": self.begin_date,
"endDate": self.end_date,
"showId": "c2ee13065aae85d7a998b8a3cd645961",
"auditing": ["1"],
"owner": "1912126876",
"token": "tourist",
"urlPrefix": "/aop_component/",
"page": {
"current": current,
"size": self.size,
"pageSizes": [2, 5, 10, 20, 50, 100],
"total": 0,
"totalPage": 0,
"indexs": []
},
"advance": False,
"advanceKeyWord": "",
"lang": "i18n_zh_CN"
}
def generate_headers(self) -> dict:
"""
生成请求头
Returns:
dict: 请求头
"""
return {
'Authorization':
'tourist',
'User-Agent':
('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
'/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari'
'/537.36 Edg/124.0.0.0')
}
def fetch(self, config: Dict[str, Any]) -> Dict[str, Any]:
"""
普通做法
Post请求获取网页内容并返回请求结果
Args:
config (Dict[str, Any]): 配置信息
Returns:
Dict[str, Any]: 请求结果
"""
response = requests.post(self.URL,
headers=self.generate_headers(),
json=config).text
time.sleep(3)
return json.loads(response)
async def fetch_async(self, config: Dict[str, Any]) -> Dict[str, Any]:
"""
异步做法
Post请求获取网页内容并返回请求结果
Args:
config (Dict[str, Any]): 配置信息
Returns:
Dict[str, Any]: 请求结果
"""
response = requests.post(self.URL,
headers=self.generate_headers(),
json=config).text
await asyncio.sleep(3)
return json.loads(response)
def parse(self, data: Dict[str, Any]) -> List[str]:
"""
解析网页内容
Args:
data (Dict[str, Any]): 网页内容
Returns:
List[str]: 标题列表
"""
title_list = []
records = data['data']['page']['records']
for i in range(self.size):
title = records[i]['title']
title = re.sub('<[^>]*>', '', title) # 去除html标签
title_list.append(title)
# print(title)
return title_list
def save(self, title_list: List[str]):
"""
保存数据
"""
pass
# 时间装饰器
def timeit(func):
"""
计算函数运行时间
Args:
func: 函数
Return:
函数
"""
def wrapper(*args, **kwargs):
start = time.time()
result = func(*args, **kwargs)
print(f'{func.__name__} cost: {time.time() - start}')
return result
return wrapper
def timeit_async(func):
@functools.wraps(func)
async def wrapper(*args, **kwargs):
start = time.time()
result = await func(*args, **kwargs)
print(f'{func.__name__} cost: {time.time() - start}')
return result
return wrapper