import time
from collections import Counter
from urllib.parse import urlencode

import jieba
import matplotlib.pyplot as plt
import requests
import xlwt
from pyquery import PyQuery as pq
from wordcloud import WordCloud

# Base URL and headers (Weibo search results load via Ajax; see Network -> XHR)
host = 'm.weibo.cn'
base_url = f'https://{host}/api/container/getIndex?'

# Request headers
headers = {
    'Host': host,
    'Referer': 'https://m.weibo.cn/search?containerid=231522type%3D1%26q%3D%23%E7%BE%8E%E5%9B%BD%E7%96%AB%E6%83%85%23',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'
}


# Fetch data page by page
def get_single_page(page):
    """
    Fetch the JSON data for a single result page.

    :param page: page number
    :return: parsed JSON data, or None on failure
    """
    params = {
        'containerid': '231522type=1&q=#人工智能#',
        'page_type': 'searchall',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Fetch error:', e.args)


# Parse the JSON data returned by each page
count = 0


def parse_page(json_data):
    """
    Parse the JSON data for one page and yield post records.

    :param json_data: JSON data
    :return: generator of parsed records
    """
    global count
    if not json_data:
        return
    items = json_data.get('data', {}).get('cards', [])
    for item in items:
        mblog = item.get('mblog')
        if mblog:
            data = {
                'id': mblog.get('id'),
                'created': mblog.get('created_at'),
                'text': pq(mblog.get('text')).text()  # strip HTML, keep plain text
            }
            yield data
            count += 1


def segment_text(text):
    """
    Segment the text into words with jieba.

    :param text: input text
    :return: list of segmented words
    """
    seg_list = jieba.cut(text.strip())
    return list(seg_list)


def generate_wordcloud(words):
    """
    Generate, display and save a word cloud image.

    :param words: list of segmented words
    """
    wordcloud = WordCloud(
        font_path='simhei.ttf',  # font file that supports Chinese characters
        background_color='white',
        width=800,
        height=600
    ).generate(' '.join(words))

    # Display the word cloud
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    # Save the word cloud image
    wordcloud.to_file('fjt.png')


def predict_event_probability(words, event_keywords):
    """
    Estimate the probability of an event from keyword frequency.

    :param words: list of segmented words
    :param event_keywords: list of event keywords
    :return: estimated probability of the event
    """
    word_count = Counter(words)
    total_words = sum(word_count.values())
    event_word_count = sum(word_count[word] for word in event_keywords if word in word_count)
    probability = event_word_count / total_words if total_words > 0 else 0
    return probability


if __name__ == '__main__':
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('人工智能')

    # Column headers
    worksheet.write(0, 0, '创建时间')
    worksheet.write(0, 1, '文本')
    worksheet.write(0, 2, '分词结果')
    worksheet.write(0, 3, '事件概率')

    row = 1
    all_words = []
    event_keywords = ['人工智能']

    for page in range(1, 5):
        json_data = get_single_page(page)
        results = parse_page(json_data)
        for result in results:
            created = result.get('created').strip('\n')
            text = result.get('text').strip('\n')
            segmented_text = segment_text(text)

            # Write creation time and text
            worksheet.write(row, 0, label=created)
            worksheet.write(row, 1, label=text)

            # Write the segmented words, one per row
            for idx, word in enumerate(segmented_text):
                worksheet.write(row + idx, 2, label=word)

            all_words.extend(segmented_text)  # collect words for the word cloud

            # Estimate the event probability for this post
            event_probability = predict_event_probability(segmented_text, event_keywords)
            worksheet.write(row, 3, label=event_probability)

            # Advance at least one row so records never overwrite each other
            row += max(len(segmented_text), 1)
        time.sleep(1)  # polite delay between page requests

    workbook.save('fjt.xls')

    # Generate the word cloud from all collected words
    generate_wordcloud(all_words)