You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

115 lines
4.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
import time
from collections import Counter
import pandas as pd # 导入pandas库用于数据处理
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
query = "2024巴黎奥运会" # search keyword ("2024 Paris Olympics")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36" # spoof a desktop browser so Bilibili serves regular pages
}
count = 300 # stop collecting once this many cids have been gathered
total_page = 10 # maximum number of search-result pages to crawl
cid_pattern = re.compile(r'"cid":(\d+)') # extracts a video's cid from its page HTML
total_cid_list = [] # cids accumulated across all crawled pages (filled by GetCid)
total_comment_dict = {} # danmaku text -> total occurrence count (filled by GetDanmu)
bvid_pattern = re.compile(r'bvid:"(.*?)"') # extracts video bvids from search-result HTML
sorted_comment_dict = {} # AI-related danmaku sorted by frequency (filled by SortDanmu)
def GetFirstBidUrl():
    """Return the hard-coded Bilibili search-results URL used for page 1."""
    return (
        "https://search.bilibili.com/all?vt=82099157"
        "&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A"
        "&from_source=webtop_search&spm_id_from=333.1007"
        "&search_source=5&page=2&o=36"
    )
def fetch(url):
    """GET *url* with the shared browser-style headers.

    Returns the requests.Response on HTTP success, or None on any network
    error or non-2xx status (the error is printed, not raised, so callers
    can simply skip the failed page).
    """
    try:
        # timeout prevents the crawler from hanging forever on a stalled server
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response
    except requests.RequestException as e:
        print(f"请求错误: {e}")
        return None
def GetCid():
    """Crawl search-result pages and resolve each video's bvid to its cid.

    Appends cids to the global total_cid_list until `count` entries are
    collected or `total_page` pages have been processed. Sleeps 1s between
    video-page requests to keep the request rate polite.
    """
    for page in range(1, total_page + 1):
        if len(total_cid_list) >= count:
            break
        print(f"处理第{page}页...")
        search_url = GetFirstBidUrl() if page == 1 else f"https://search.bilibili.com/all?vt=82451961&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}&o=36"
        response = fetch(search_url)
        if response is None:
            continue
        current_bvid_list = bvid_pattern.findall(response.text)
        for bvid in current_bvid_list:
            video_url = f"https://www.bilibili.com/video/{bvid}"
            response = fetch(video_url)
            if response is None:
                continue
            # Some pages may not embed a "cid": token; skip them instead of
            # crashing on .group() of a None match.
            cid_match = cid_pattern.search(response.text)
            if cid_match is None:
                continue
            current_cid = cid_match.group(1)
            print(f"获取到cid: {current_cid}")
            total_cid_list.append(current_cid)
            if len(total_cid_list) >= count:
                break
            time.sleep(1)  # throttle to avoid hammering the server
# Compiled once instead of re-matching an uncompiled pattern on every video.
_danmu_pattern = re.compile('<d p=".*?">(.*?)</d>')


def GetDanmu():
    """Download the danmaku XML for every collected cid and tally frequencies.

    Merges each video's per-comment counts into the global total_comment_dict.
    Sleeps 0.5s between requests to keep the request rate polite.
    """
    for index, cid in enumerate(total_cid_list):
        print(f"正在获取第{index + 1}个视频的弹幕")
        danmu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        response = fetch(danmu_url)
        if response is None:
            continue
        response.encoding = 'utf-8'  # the danmaku XML is UTF-8 encoded
        current_danmu_list = _danmu_pattern.findall(response.text)
        current_comment_dict = Counter(current_danmu_list)
        for k, v in current_comment_dict.items():
            total_comment_dict[k] = total_comment_dict.get(k, 0) + v
        time.sleep(0.5)  # throttle to avoid hammering the API
def SortDanmu():
    """Filter AI-related danmaku, rank them by frequency, and export to Excel.

    Rebuilds the global sorted_comment_dict from total_comment_dict, keeping
    only comments that mention standalone "ai" or the Chinese term, ordered by
    descending count, then writes the result to comments.xlsx.
    """
    global sorted_comment_dict
    # Standalone "ai" (not embedded in a longer Latin word) or the Chinese term.
    pattern = re.compile(r'(?<![a-zA-Z])ai(?![a-zA-Z])|人工智能', re.IGNORECASE)
    filtered = {text: freq for text, freq in total_comment_dict.items() if pattern.search(text)}
    ranked = sorted(filtered.items(), key=lambda item: item[1], reverse=True)
    sorted_comment_dict = dict(ranked)
    print(sorted_comment_dict)
    df = pd.DataFrame(ranked, columns=['Comment', 'Count'])
    df.to_excel('comments.xlsx', index=False)
    print("已成功写入Excel文件")
def CreateWordCloud():
    """Render and display a word cloud from the filtered danmaku.

    Segments each comment with jieba and joins the tokens with spaces —
    WordCloud splits its input text on whitespace, so joining segments with
    '' (as before) would glue the words back together and defeat the
    segmentation entirely.
    """
    comment_text = ' '.join(' '.join(jieba.cut(k, cut_all=False)) for k in sorted_comment_dict.keys())
    wordcloud = WordCloud(
        font_path='C:/Windows/Fonts/simsun.ttc',  # CJK-capable font; default font cannot render Chinese
        width=800, height=400,
        background_color='white',
        max_words=200,
        colormap='viridis'
    ).generate(comment_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
def main():
    """Run the full pipeline: collect cids, fetch danmaku, rank them, draw the cloud."""
    for step in (GetCid, GetDanmu, SortDanmu, CreateWordCloud):
        step()


if __name__ == "__main__":
    main()