import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import time
import xlwt
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
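
# Third-party dependencies used below: requests, pyquery, xlwt, jieba,
# wordcloud and matplotlib. A font that supports Chinese characters
# (simhei.ttf, referenced in generate_wordcloud) must be available at that
# relative path for the word cloud to render Chinese text.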
# Sina Weibo (m.weibo.cn) loads search results asynchronously via Ajax
# (visible under Network -> XHR in the browser dev tools), so we query the
# container API directly instead of scraping HTML pages.
host = 'm.weibo.cn'
base_url = f'https://{host}/api/container/getIndex?'

# Request headers: mimic the mobile site so the API accepts the request
headers = {
    'Host': host,
    'Referer': 'https://m.weibo.cn/search?containerid=231522type%3D1%26q%3D%23%E7%BE%8E%E5%9B%BD%E7%96%AB%E6%83%85%23',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'
}

# Fetch data page by page
def get_single_page(page):
    """
    Fetch the data of a single result page.
    :param page: page number
    :return: parsed JSON data, or None on failure
    """
    params = {
        'containerid': '231522type=1&q=#人工智能#',
        'page_type': 'searchall',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Fetch error:', e.args)
    return None

# Parse the JSON data returned by a page
count = 0

def parse_page(json_data):
    """
    Parse the JSON data returned by a page.
    :param json_data: JSON data from get_single_page
    :return: generator of parsed records
    """
    global count
    if not json_data:
        return
    items = (json_data.get('data') or {}).get('cards') or []
    for item in items:
        mblog = item.get('mblog')
        if mblog:
            data = {
                'id': mblog.get('id'),
                'created': mblog.get('created_at'),
                'text': pq(mblog.get('text')).text()  # extract plain text from the HTML content
            }
            yield data
            count += 1
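
# Note on the response shape this parser assumes: the getIndex container API
# returns a JSON object whose data.cards list holds the result cards, and a
# card that represents a post carries the post body under its 'mblog' key.
# This mirrors what the code above reads and may change on Weibo's side.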

def segment_text(text):
    """
    Segment a piece of text into words with jieba.
    :param text: input text
    :return: list of segmented words
    """
    seg_list = jieba.cut(text.strip())
    return list(seg_list)

def generate_wordcloud(words):
    """
    Generate and save a word cloud image.
    :param words: list of segmented words
    """
    wordcloud = WordCloud(
        font_path='simhei.ttf',  # path to a font that supports Chinese characters
        background_color='white',
        width=800,
        height=600
    ).generate(' '.join(words))
    # Display the word cloud
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    # Save the word cloud image
    wordcloud.to_file('fjt.png')

def predict_event_probability(words, event_keywords):
    """
    Estimate the probability of an event from keyword frequency.
    :param words: list of segmented words
    :param event_keywords: list of event keywords
    :return: estimated probability of the event
    """
    word_count = Counter(words)
    total_words = sum(word_count.values())
    event_word_count = sum(word_count[word] for word in event_keywords if word in word_count)
    probability = event_word_count / total_words if total_words > 0 else 0
    return probability
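
# Illustrative check (hypothetical input): for words = ['人工智能', '发展',
# '人工智能', '迅速'] and event_keywords = ['人工智能'], the function counts
# 2 keyword hits out of 4 tokens and returns 0.5, i.e. the share of tokens
# that match an event keyword.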

if __name__ == '__main__':
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('人工智能')
    # Column headers
    worksheet.write(0, 0, '创建时间')
    worksheet.write(0, 1, '文本')
    worksheet.write(0, 2, '分词结果')
    worksheet.write(0, 3, '事件概率')
    row = 1
    all_words = []
    event_keywords = ['人工智能']
    for page in range(1, 5):
        json_data = get_single_page(page)
        if not json_data:
            continue  # skip pages that failed to load
        results = parse_page(json_data)
        for result in results:
            created = result.get('created').strip('\n')
            text = result.get('text').strip('\n')
            segmented_text = segment_text(text)
            # Write creation time and text
            worksheet.write(row, 0, label=created)
            worksheet.write(row, 1, label=text)
            # Write the segmented words, one per row
            for idx, word in enumerate(segmented_text):
                worksheet.write(row + idx, 2, label=word)
            all_words.extend(segmented_text)  # collect words for the word cloud
            # Estimate the event probability for this post
            event_probability = predict_event_probability(segmented_text, event_keywords)
            worksheet.write(row, 3, label=event_probability)
            row += max(len(segmented_text), 1)  # advance past the rows just written
        time.sleep(1)  # pause between pages to avoid hammering the API
    workbook.save('fjt.xls')
    # Generate the word cloud
    generate_wordcloud(all_words)
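
# Outputs: the script writes the collected posts, their segmented words and
# per-post keyword probabilities to fjt.xls, and saves the word cloud image
# as fjt.png, both in the current working directory.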