import requests
import re
import time
import pandas as pd  # pandas for data handling and Excel I/O
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from snownlp import SnowNLP
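
# Third-party dependencies (assumption: installed via pip):
#   requests, pandas, openpyxl (the Excel engine), wordcloud, matplotlib, snownlp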

# Crawl the BV ids of Bilibili videos matching a search keyword
def get_bv_and_cid(keyword, limit=300):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."  # request header that mimics a browser
    }
    bvid_set = set()  # a set avoids duplicate BV ids
    for page in range(1, 15):  # crawl the search results page by page
        if len(bvid_set) >= limit:
            break
        url = f"https://search.bilibili.com/video?keyword={keyword}&page={page}&page_size=30"
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # make sure the request succeeded
            data = response.text
            bvid_set.update(re.findall('href="//www.bilibili.com/video/(.*?)/"', data))  # extract BV ids
        except requests.RequestException as e:
            print(f"Request error: {e}")
    bvid_list = list(bvid_set)[:limit]  # convert to a list and cap the count
    print("Successfully crawled", len(bvid_list), "BV ids")
    return bvid_list
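
# NOTE (added): the BV ids above are scraped out of the search page's HTML with
# a regex, so this breaks whenever Bilibili changes its markup; treat the
# pattern as a snapshot of the current page structure rather than a stable API.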

# Crawl the danmu for each BV id
def get_danmu(bvid_list):
    danmu_data = []  # accumulated danmu text
    headers = {"User-Agent": "..."}  # request header
    for bv in bvid_list:
        url = f'https://www.bilibili.com/video/{bv}/'  # video page URL
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        # pull the cid(s) out of the player config embedded in the video page
        cids = re.findall('"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),', response.text)
        print(cids)
        # crawl the danmu XML for each cid
        for cid in cids:
            url = f"https://comment.bilibili.com/{cid}.xml"
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            danmu_list = re.findall('<d p=".*?">(.*?)</d>', response.text)  # extract the danmu text
            print(danmu_list)
            danmu_data.extend(danmu_list)
            time.sleep(0.5)  # throttle requests so we don't hit the server too fast
    return danmu_data
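
# NOTE (added): re.findall leaves XML entities escaped in the extracted text
# (e.g. "&amp;" instead of "&"). A minimal standard-library sketch for decoding
# them (hypothetical helper, not part of the original script):
def unescape_danmu(danmu_list):
    """Decode XML/HTML entities such as "&amp;" in extracted danmu."""
    import html
    return [html.unescape(d) for d in danmu_list]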

# Save the danmu to a text file and an Excel file
def save_danmu(danmu_list, filename_txt="danmu.txt", filename_xlsx="danmu.xlsx"):
    with open(filename_txt, "w", encoding="utf-8") as file:
        for danmu in danmu_list:
            file.write(danmu + "\n")
    df = pd.DataFrame(danmu_list, columns=['弹幕内容'])
    df.to_excel(filename_xlsx, index=False, engine='openpyxl')
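
# NOTE (added): openpyxl refuses to write ASCII control characters in cell
# values and raises an error, which can occur with scraped text. A minimal
# stdlib sketch that strips them before writing (hypothetical helper):
def strip_control_chars(text):
    # remove control characters that openpyxl rejects in cell values
    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)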

# Read the Excel file and keep only danmu that contain certain keywords
def read_and_filter_danmu(file_path):
    df = pd.read_excel(file_path, engine='openpyxl')
    keywords = ['AI', 'AI技术', 'AI技术应用', '人工智能应用', '人工智能', '机器学习', '生成']
    filtered_danmu = df[df['弹幕内容'].str.contains('|'.join(keywords), na=False, case=False)]
    return filtered_danmu
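
# NOTE (added): str.contains treats '|'.join(keywords) as a regular expression.
# The keywords above contain no regex metacharacters, but if they ever do,
# escape them first, e.g. pattern = '|'.join(re.escape(k) for k in keywords).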

# Count and sort the filtered danmu to find the most frequent ones
def count_and_sort_danmu(danmu_df):
    danmu_counts = danmu_df['弹幕内容'].value_counts()
    top_danmu = danmu_counts.head(8)  # keep the eight most frequent danmu
    return top_danmu

# Save the most frequent danmu to an Excel file
def save_top_danmu(top_danmu, output_file):
    top_danmu_df = pd.DataFrame(top_danmu).reset_index()
    top_danmu_df.columns = ['弹幕内容', '数量']  # danmu text, count
    top_danmu_df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Top danmu saved to {output_file}")

# Generate a word cloud image
def generate_wordcloud(danmu_list, output_image_path):
    text = ' '.join(danmu_list)
    # msyh.ttc (Microsoft YaHei) is needed so Chinese glyphs render correctly
    wordcloud = WordCloud(font_path='msyh.ttc', background_color='white', width=800, height=600, min_font_size=10)
    wordcloud.generate(text)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(output_image_path, dpi=300)
    plt.show()
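
# NOTE (added): WordCloud's default tokenizer does not segment Chinese, so
# unsegmented text tends to collapse into a few long strings. A common fix is
# to segment with jieba first (assumption: jieba is installed), e.g.:
#     import jieba
#     text = ' '.join(jieba.lcut(''.join(danmu_list)))
# and then pass that text to wordcloud.generate().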

# Sentiment analysis with snownlp
def analyze_sentiment(danmu_list):
    sentiment_results = []
    for danmu in danmu_list:
        s = SnowNLP(danmu)
        sentiment_score = s.sentiments  # probability that the danmu is positive
        sentiment_results.append(sentiment_score)
    return sentiment_results
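
# NOTE (added): SnowNLP's .sentiments is the probability that the text is
# positive, in [0, 1]. A minimal sketch that summarizes the scores
# (hypothetical helper; the 0.5 threshold is an assumption):
def summarize_sentiment(scores, threshold=0.5):
    # count danmu classified as positive vs. negative at the given threshold
    positive = sum(1 for s in scores if s >= threshold)
    return {'positive': positive, 'negative': len(scores) - positive}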

if __name__ == "__main__":
    keyword = "2024巴黎奥运会"
    bvid_list = get_bv_and_cid(keyword)
    danmu_list = get_danmu(bvid_list)
    save_danmu(danmu_list)

    file_path = 'danmu.xlsx'
    filtered_danmu = read_and_filter_danmu(file_path)
    top_danmu = count_and_sort_danmu(filtered_danmu)

    output_file = 'top_danmu.xlsx'
    save_top_danmu(top_danmu, output_file)

    file_path = 'top_danmu.xlsx'
    top_danmu_df = pd.read_excel(file_path, engine='openpyxl')

    top_list = top_danmu_df['弹幕内容'].tolist()
    if not top_list:
        print("No danmu left after filtering; skipping the word cloud step.")
    else:
        output_image_path = 'wordcloud.png'
        generate_wordcloud(top_list, output_image_path)
        sentiments = analyze_sentiment(top_list)
        print("Sentiment analysis results:", sentiments)
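
# Files produced by a full run: danmu.txt and danmu.xlsx (all crawled danmu),
# top_danmu.xlsx (the most frequent keyword-filtered danmu), wordcloud.png.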