import requests
import re
import time
import pandas as pd  # pandas for data processing and Excel file I/O
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from snownlp import SnowNLP


# Scrape bvids of Bilibili videos matching a search keyword
def get_bv_and_cid(keyword, limit=300):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."  # request header to mimic a browser
    }
    bvid_set = set()  # use a set to deduplicate bvids
    for page in range(1, 15):  # paginate through search results
        if len(bvid_set) >= limit:
            break
        url = f"https://search.bilibili.com/video?keyword={keyword}&page={page}&page_size=30"
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # make sure the request succeeded
            data = response.text
            bvid_set.update(re.findall('href="//www.bilibili.com/video/(.*?)/"', data))  # extract bvids
        except requests.RequestException as e:
            print(f"Request error: {e}")
    bvid_list = list(bvid_set)[:limit]  # convert to a list and cap at limit
    print("Successfully scraped", len(bvid_list), "bvids")
    return bvid_list


# Scrape the danmaku (bullet comments) for each bvid
def get_danmu(bvid_list):
    danmu_data = []  # collected danmaku text
    headers = {"User-Agent": "..."}  # request header
    for bv in bvid_list:
        url = f'https://www.bilibili.com/video/{bv}/'  # video page URL
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        oids = re.findall('"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),', response.text)
        print(oids)
        # fetch the danmaku XML for each cid
        for cid in oids:
            url = f"https://comment.bilibili.com/{cid}.xml"
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            danmu_list = re.findall('<d p=".*?">(.*?)</d>', response.text)  # extract danmaku text from <d> tags
            print(danmu_list)
            danmu_data.extend(danmu_list)
            time.sleep(0.5)  # throttle requests to avoid hitting the server too fast
    return danmu_data


# Save the danmaku to a text file and an Excel file
def save_danmu(danmu_list, filename_txt="danmu.txt", filename_xlsx="danmu.xlsx"):
    with open(filename_txt, "w", encoding="utf-8") as file:
        for danmu in danmu_list:
            file.write(danmu + "\n")
    df = pd.DataFrame(danmu_list, columns=['弹幕内容'])
    df.to_excel(filename_xlsx, index=False, engine='openpyxl')


# Read the Excel file and keep only danmaku containing the target keywords
def read_and_filter_danmu(file_path):
    df = pd.read_excel(file_path, engine='openpyxl')
    keywords = ['AI', 'AI技术', 'AI技术应用', '人工智能应用', '人工智能', '机器学习', '生成']
    filtered_danmu = df[df['弹幕内容'].str.contains('|'.join(keywords), na=False, case=False)]
    return filtered_danmu


# Count and sort the filtered danmaku, keeping the most frequent ones
def count_and_sort_danmu(danmu_df):
    danmu_counts = danmu_df['弹幕内容'].value_counts()
    top_danmu = danmu_counts.head(8)
    return top_danmu


# Save the most frequent danmaku to an Excel file
def save_top_danmu(top_danmu, output_file):
    top_danmu_df = pd.DataFrame(top_danmu).reset_index()
    top_danmu_df.columns = ['弹幕内容', '数量']
    top_danmu_df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Top danmu saved to {output_file}")


# Generate a word-cloud image
def generate_wordcloud(danmu_list, output_image_path):
    text = ' '.join(danmu_list)
    wordcloud = WordCloud(font_path='msyh.ttc', background_color='white',
                          width=800, height=600, min_font_size=10)
    wordcloud.generate(text)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(output_image_path, dpi=300)
    plt.show()


# Sentiment analysis with the snownlp library
def analyze_sentiment(danmu_list):
    sentiment_results = []
    for danmu in danmu_list:
        s = SnowNLP(danmu)
        sentiment_score = s.sentiments
        sentiment_results.append(sentiment_score)
    return sentiment_results


if __name__ == "__main__":
    keyword = "2024巴黎奥运会"
    bvid_list = get_bv_and_cid(keyword)
    danmu_list = get_danmu(bvid_list)
    save_danmu(danmu_list)
    file_path = 'danmu.xlsx'
    filtered_danmu = read_and_filter_danmu(file_path)
    top_danmu = count_and_sort_danmu(filtered_danmu)
    output_file = 'top_danmu.xlsx'
    save_top_danmu(top_danmu, output_file)
    file_path = 'top_danmu.xlsx'
    top_danmu_df = pd.read_excel(file_path, engine='openpyxl')
    top_list = top_danmu_df['弹幕内容'].tolist()
    if not top_list:
        print("Filtered danmaku list is empty; skipping the word cloud step.")
    else:
        output_image_path = 'wordcloud.png'
        generate_wordcloud(top_list, output_image_path)
        sentiments = analyze_sentiment(top_list)
        print("Sentiment analysis results:", sentiments)
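
# --- Illustrative sketch, appended; not part of the original pipeline ---
# SnowNLP's `sentiments` attribute is a probability in [0, 1]: scores near 1
# lean positive, scores near 0 lean negative. This helper (its name and the
# 0.5 threshold are assumptions for illustration) maps the raw scores that
# analyze_sentiment() returns to coarse labels, reusing the SnowNLP import above.
def label_sentiments(danmu_list, threshold=0.5):
    labels = []
    for danmu in danmu_list:
        score = SnowNLP(danmu).sentiments  # probability of positive sentiment
        labels.append((danmu, "positive" if score >= threshold else "negative"))
    return labels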