From b9ca767c7956e0a760cc46a9690b72b7641bc93c Mon Sep 17 00:00:00 2001
From: pfc8hp2r6 <2317678682@qq.com>
Date: Mon, 16 Sep 2024 18:21:11 +0800
Subject: [PATCH] ADD file via upload

---
 fujia.py | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 fujia.py

diff --git a/fujia.py b/fujia.py
new file mode 100644
index 0000000..e689eed
--- /dev/null
+++ b/fujia.py
@@ -0,0 +1,156 @@
+import requests
+from urllib.parse import urlencode
+from pyquery import PyQuery as pq
+import time
+import xlwt
+import jieba
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+from collections import Counter
+
+# Sina Weibo loads search results via Ajax (DevTools: Network -> XHR), so query the mobile API directly
+host = 'm.weibo.cn'
+base_url = f'https://{host}/api/container/getIndex?'
+
+# Request headers
+headers = {
+    'Host': host,
+    'Referer': 'https://m.weibo.cn/search?containerid=231522type%3D1%26q%3D%23%E7%BE%8E%E5%9B%BD%E7%96%AB%E6%83%85%23',
+    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'
+}
+
+# Fetch data page by page
+def get_single_page(page):
+    """
+    Fetch a single page of search results.
+
+    :param page: page number
+    :return: JSON data, or None on failure
+    """
+    params = {
+        'containerid': '231522type=1&q=#人工智能#',
+        'page_type': 'searchall',
+        'page': page
+    }
+    url = base_url + urlencode(params)
+    try:
+        response = requests.get(url, headers=headers)
+        if response.status_code == 200:
+            return response.json()
+    except requests.ConnectionError as e:
+        print('Fetch error:', e.args)
+    return None
+
+# Parse the JSON data returned for each page
+count = 0
+
+def parse_page(json_data):
+    """
+    Parse the JSON data returned for one page.
+
+    :param json_data: JSON data
+    :return: generator of parsed records
+    """
+    global count
+    items = (json_data or {}).get('data', {}).get('cards', [])
+    for item in items:
+        mblog = item.get('mblog')
+        if mblog:
+            data = {
+                'id': mblog.get('id'),
+                'created': mblog.get('created_at'),
+                'text': pq(mblog.get('text')).text()  # extract plain text from the post HTML
+            }
+            yield data
+            count += 1
+
+def segment_text(text):
+    """
+    Segment text into words.
+
+    :param text: input text
+    :return: list of words
+    """
+    seg_list = jieba.cut(text.strip())
+    return list(seg_list)
+
+def generate_wordcloud(words):
+    """
+    Generate a word cloud image.
+
+    :param words: list of segmented words
+    """
+    wordcloud = WordCloud(
+        font_path='simhei.ttf',  # path to a font that supports Chinese characters
+        background_color='white',
+        width=800,
+        height=600
+    ).generate(' '.join(words))
+
+    # Display the word cloud
+    plt.imshow(wordcloud, interpolation='bilinear')
+    plt.axis('off')
+    plt.show()
+
+    # Save the word cloud image
+    wordcloud.to_file('fjt.png')
+
+def predict_event_probability(words, event_keywords):
+    """
+    Estimate the probability of an event from keyword frequencies.
+
+    :param words: list of segmented words
+    :param event_keywords: list of event keywords
+    :return: estimated probability of the event
+    """
+    word_count = Counter(words)
+    total_words = sum(word_count.values())
+    event_word_count = sum(word_count[word] for word in event_keywords if word in word_count)
+    probability = event_word_count / total_words if total_words > 0 else 0
+    return probability
+
+if __name__ == '__main__':
+    workbook = xlwt.Workbook(encoding='utf-8')
+    worksheet = workbook.add_sheet('人工智能')
+
+    # Column headers
+    worksheet.write(0, 0, '创建时间')  # created time
+    worksheet.write(0, 1, '文本')  # text
+    worksheet.write(0, 2, '分词结果')  # segmented words
+    worksheet.write(0, 3, '事件概率')  # event probability
+
+    row = 1
+
+    all_words = []
+    event_keywords = ['人工智能']
+
+    for page in range(1, 5):
+        json_data = get_single_page(page)
+        results = parse_page(json_data)
+
+        for result in results:
+            created = result.get('created').strip('\n')
+            text = result.get('text').strip('\n')
+            segmented_text = segment_text(text)
+
+            # Write created time and text
+            worksheet.write(row, 0, label=created)
+            worksheet.write(row, 1, label=text)
+
+            # Write segmented words, one per row
+            for idx, word in enumerate(segmented_text):
+                worksheet.write(row + idx, 2, label=word)
+
+            all_words.extend(segmented_text)  # collect words for the word cloud
+
+            # Estimate the event probability for this post
+            event_probability = predict_event_probability(segmented_text, event_keywords)
+            worksheet.write(row, 3, label=event_probability)
+
+            row += max(len(segmented_text), 1)  # advance past the rows used for segmented words
+
+        time.sleep(1)  # pause between page requests
+
+    workbook.save('fjt.xls')
+    # Generate the word cloud
+    generate_wordcloud(all_words)
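
A minimal sketch of how the keyword-frequency heuristic in predict_event_probability above behaves; the sample word list below is hypothetical, made up purely for illustration:

    from collections import Counter

    def predict_event_probability(words, event_keywords):
        # Same heuristic as in fujia.py: the share of tokens that match an event keyword.
        word_count = Counter(words)
        total_words = sum(word_count.values())
        event_word_count = sum(word_count[w] for w in event_keywords if w in word_count)
        return event_word_count / total_words if total_words > 0 else 0

    # Hypothetical segmented post: 2 of its 6 tokens are the keyword, so the score is 2/6.
    sample_words = ['人工智能', '发展', '很', '快', '人工智能', '应用']
    print(predict_event_probability(sample_words, ['人工智能']))  # 0.3333...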