import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import time
import xlwt
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

# Request target setup (Sina Weibo loads results asynchronously via Ajax pull-down; see Network -> XHR in dev tools)
host = 'm.weibo.cn'
base_url = f'https://{host}/api/container/getIndex?'
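
# A fully expanded request URL, derived from the params built in get_single_page() below:
#   https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%23&page_type=searchall&page=1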

# Request headers
# Note: this Referer points at a different hashtag search (#美国疫情#) than the containerid queried below.
headers = {
    'Host': host,
    'Referer': 'https://m.weibo.cn/search?containerid=231522type%3D1%26q%3D%23%E7%BE%8E%E5%9B%BD%E7%96%AB%E6%83%85%23',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'
}


# Fetch data page by page
def get_single_page(page):
    """
    Fetch a single page of results.

    :param page: page number
    :return: JSON data as a dict, or None on failure
    """
    params = {
        'containerid': '231522type=1&q=#人工智能#',
        'page_type': 'searchall',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Fetch error:', e.args)
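
# On success the response is a dict whose 'data' key holds a 'cards' list
# (the shape parse_page() consumes below). A quick smoke test might look like:
#
#   page_json = get_single_page(1)
#   if page_json:
#       print(len(page_json['data']['cards']))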


# Parse the JSON data returned for a page
count = 0


def parse_page(json_data):
    """
    Parse the JSON data returned for a page.

    :param json_data: JSON data as a dict (may be None if the fetch failed)
    :return: generator of parsed post records
    """
    global count
    if not json_data:  # guard against a failed fetch returning None
        return
    items = (json_data.get('data') or {}).get('cards') or []
    for item in items:
        mblog = item.get('mblog')
        if mblog:
            data = {
                'id': mblog.get('id'),
                'created': mblog.get('created_at'),
                'text': pq(mblog.get('text')).text()  # extract plain text from the HTML body
            }
            yield data
            count += 1
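
# Each yielded record is a plain dict, e.g. (illustrative values, not real data):
#   {'id': '4567890123456789', 'created': 'Mon Oct 12 10:00:00 +0800 2020', 'text': '...'}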


def segment_text(text):
    """
    Segment text into words with jieba.

    :param text: input text
    :return: list of segmented words
    """
    seg_list = jieba.cut(text.strip())
    return list(seg_list)
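
# Quick sanity check (the exact split depends on jieba's dictionary, so treat
# this output as an assumption):
#   segment_text('人工智能改变世界')  # e.g. ['人工智能', '改变', '世界']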


def generate_wordcloud(words):
    """
    Generate, display, and save a word cloud.

    :param words: list of segmented words
    """
    wordcloud = WordCloud(
        font_path='simhei.ttf',  # font with CJK glyphs; the .ttf file must be present locally
        background_color='white',
        width=800,
        height=600
    ).generate(' '.join(words))

    # Display the word cloud
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    # Save the word cloud image
    wordcloud.to_file('fjt.png')
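
# Usage sketch: generate_wordcloud(['人工智能', '数据', '学习']) opens a
# matplotlib window and writes fjt.png to the working directory.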


def predict_event_probability(words, event_keywords):
    """
    Estimate the probability of an event from keyword frequency.

    :param words: list of segmented words
    :param event_keywords: list of event keywords
    :return: estimated event probability
    """
    word_count = Counter(words)
    total_words = sum(word_count.values())
    event_word_count = sum(word_count[word] for word in event_keywords if word in word_count)
    probability = event_word_count / total_words if total_words > 0 else 0
    return probability
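
# Worked example (illustrative values): with words = ['人工智能', '正在', '改变', '世界']
# and event_keywords = ['人工智能'], the keyword accounts for 1 of 4 tokens,
# so the function returns 1 / 4 = 0.25.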


if __name__ == '__main__':
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('人工智能')

    # Set the column headers
    worksheet.write(0, 0, '创建时间')
    worksheet.write(0, 1, '文本')
    worksheet.write(0, 2, '分词结果')
    worksheet.write(0, 3, '事件概率')

    row = 1

    all_words = []
    event_keywords = ['人工智能']

    for page in range(1, 5):
        json_data = get_single_page(page)
        results = parse_page(json_data)

        for result in results:
            created = (result.get('created') or '').strip('\n')
            text = (result.get('text') or '').strip('\n')
            segmented_text = segment_text(text)

            # Write creation time and text
            worksheet.write(row, 0, label=created)
            worksheet.write(row, 1, label=text)

            # Write segmented words, one per row
            for idx, word in enumerate(segmented_text):
                worksheet.write(row + idx, 2, label=word)

            all_words.extend(segmented_text)  # collect words for the word cloud

            # Compute the event probability for this post
            event_probability = predict_event_probability(segmented_text, event_keywords)
            worksheet.write(row, 3, label=event_probability)

            # Advance past the word rows; always move at least one row so an
            # empty segmentation cannot overwrite the previous record
            row += max(len(segmented_text), 1)

        time.sleep(1)  # pause between page fetches

    workbook.save('fjt.xls')

    # Generate the word cloud
    generate_wordcloud(all_words)