import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import time
import xlwt
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
# Sina Weibo search results are loaded asynchronously via Ajax (visible under
# Network -> XHR in the browser dev tools), so we query the mobile API directly.
host = 'm.weibo.cn'
base_url = f'https://{host}/api/container/getIndex?'
# Request headers
headers = {
    'Host': host,
    'Referer': 'https://m.weibo.cn/search?containerid=231522type%3D1%26q%3D%23%E7%BE%8E%E5%9B%BD%E7%96%AB%E6%83%85%23',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'
}
# Fetch data page by page
def get_single_page(page):
    """
    Fetch a single page of results.
    :param page: page number
    :return: parsed JSON data, or None on failure
    """
    params = {
        'containerid': '231522type=1&q=#人工智能#',
        'page_type': 'searchall',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Fetch error:', e.args)
    return None
# Parse the JSON data returned for a page
count = 0
def parse_page(json_data):
    """
    Parse the JSON data returned for a page.
    :param json_data: JSON data
    :return: generator yielding one record per post
    """
    global count
    items = json_data.get('data', {}).get('cards', [])
    for item in items:
        mblog = item.get('mblog')
        if mblog:
            data = {
                'id': mblog.get('id'),
                'created': mblog.get('created_at'),
                'text': pq(mblog.get('text')).text()  # strip the HTML, keep plain text
            }
            yield data
            count += 1
def segment_text(text):
    """
    Segment text into words with jieba.
    :param text: input text
    :return: list of tokens
    """
    seg_list = jieba.cut(text.strip())
    return list(seg_list)
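# Example (a sketch only; the exact split depends on jieba's dictionary and version):
#   segment_text('人工智能正在改变世界') -> ['人工智能', '正在', '改变', '世界']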
def generate_wordcloud(words):
    """
    Generate, display and save a word cloud image.
    :param words: list of segmented words
    """
    wordcloud = WordCloud(
        font_path='simhei.ttf',  # font that supports Chinese characters
        background_color='white',
        width=800,
        height=600
    ).generate(' '.join(words))
    # Display the word cloud
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    # Save the word cloud image
    wordcloud.to_file('fjt.png')
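# Note: the word cloud needs a font file that covers Chinese glyphs; 'simhei.ttf'
# is assumed to sit in the working directory (adjust font_path if it lives elsewhere),
# otherwise the font cannot be loaded and the Chinese tokens will not render.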
def predict_event_probability(words, event_keywords):
    """
    Estimate the probability of an event from keyword frequency.
    :param words: list of segmented words
    :param event_keywords: list of event keywords
    :return: estimated event probability (keyword tokens / total tokens)
    """
    word_count = Counter(words)
    total_words = sum(word_count.values())
    event_word_count = sum(word_count[word] for word in event_keywords if word in word_count)
    probability = event_word_count / total_words if total_words > 0 else 0
    return probability
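# Worked example: for tokens ['人工智能', '正在', '改变', '世界', '人工智能'] and
# event_keywords = ['人工智能'], the keyword accounts for 2 of 5 tokens, so the
# estimated probability is 2 / 5 = 0.4.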
if __name__ == '__main__':
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('人工智能')
    # Column headers
    worksheet.write(0, 0, '创建时间')
    worksheet.write(0, 1, '文本')
    worksheet.write(0, 2, '分词结果')
    worksheet.write(0, 3, '事件概率')
    row = 1
    all_words = []
    event_keywords = ['人工智能']
    for page in range(1, 5):
        json_data = get_single_page(page)
        if not json_data:
            continue  # skip pages that failed to download
        results = parse_page(json_data)
        for result in results:
            created = result.get('created').strip('\n')
            text = result.get('text').strip('\n')
            segmented_text = segment_text(text)
            # Write creation time and text
            worksheet.write(row, 0, label=created)
            worksheet.write(row, 1, label=text)
            # Write each token on its own row
            for idx, word in enumerate(segmented_text):
                worksheet.write(row + idx, 2, label=word)
            all_words.extend(segmented_text)  # collect tokens for the word cloud
            # Estimate the event probability for this post
            event_probability = predict_event_probability(segmented_text, event_keywords)
            worksheet.write(row, 3, label=event_probability)
            row += max(len(segmented_text), 1)  # advance past this post's token rows
        time.sleep(1)  # pause between requests
    workbook.save('fjt.xls')
    # Generate the word cloud
    generate_wordcloud(all_words)