@@ -0,0 +1,114 @@
import requests
import re
import time
from collections import Counter
import pandas as pd  # pandas is used for tabular processing and the Excel export
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba  # Chinese word segmentation for the word cloud
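
# Note: requests, pandas, wordcloud, matplotlib and jieba are third-party packages,
# and pandas' to_excel() also needs an Excel writer backend (openpyxl is the usual
# choice for .xlsx). A typical install would look something like:
#   pip install requests pandas openpyxl wordcloud matplotlib jieba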

query = "2024巴黎奥运会"  # search keyword (the search URLs below hard-code its URL-encoded form)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
count = 300       # maximum number of videos (cids) to collect
total_page = 10   # maximum number of search-result pages to walk
cid_pattern = re.compile(r'"cid":(\d+)')    # compiled regex for extracting the cid from a video page
total_cid_list = []                         # cids collected across all search pages
total_comment_dict = {}                     # aggregated danmu text -> frequency
bvid_pattern = re.compile(r'bvid:"(.*?)"')  # compiled regex for extracting bvids from a search page
sorted_comment_dict = {}                    # filtered and frequency-sorted comments


def GetFirstBidUrl():
    # Return the hard-coded search-results URL used for the first request.
    # Note: the keyword is already URL-encoded inside the link, so the `query`
    # variable above is not referenced here.
    return "https://search.bilibili.com/all?vt=82099157&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=2&o=36"


def fetch(url):
    # Fetch a URL with the shared headers and return the response, or None on failure.
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response
    except requests.RequestException as e:
        print(f"Request error: {e}")
        return None


def GetCid():
    # Walk the search-result pages, collect bvids, and resolve each bvid to its cid
    # (the id used by the danmu API), stopping once `count` cids have been gathered.
    for page in range(1, total_page + 1):
        if len(total_cid_list) >= count:
            break
        print(f"Processing page {page}...")
        search_url = GetFirstBidUrl() if page == 1 else f"https://search.bilibili.com/all?vt=82451961&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}&o=36"
        response = fetch(search_url)
        if response is None:
            continue
        current_bvid_list = bvid_pattern.findall(response.text)  # extract bvids from the search page
        for bvid in current_bvid_list:
            video_url = f"https://www.bilibili.com/video/{bvid}"
            response = fetch(video_url)
            if response is None:
                continue
            cid_match = cid_pattern.search(response.text)  # extract the cid from the video page
            if cid_match is None:
                continue
            current_cid = cid_match.group(1)
            print(f"Got cid: {current_cid}")
            total_cid_list.append(current_cid)
            if len(total_cid_list) >= count:
                break
            time.sleep(1)  # pause for 1 second to avoid hammering the server


def GetDanmu():
    # Download the danmu (bullet comments) for every collected cid and aggregate
    # the per-comment frequencies into total_comment_dict.
    for index, cid in enumerate(total_cid_list):
        print(f"Fetching danmu for video {index + 1}")
        danmu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"  # danmu API URL for this cid
        response = fetch(danmu_url)
        if response is None:
            continue
        response.encoding = 'utf-8'  # ensure the XML is decoded as UTF-8
        current_danmu_list = re.findall('<d p=".*?">(.*?)</d>', response.text)  # extract the comment texts
        current_comment_dict = Counter(current_danmu_list)  # per-video comment frequencies
        for k, v in current_comment_dict.items():
            total_comment_dict[k] = total_comment_dict.get(k, 0) + v  # merge into the global counts
        time.sleep(0.5)  # pause for 0.5 seconds to avoid hammering the server
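
# Note on GetDanmu(): the list.so endpoint returns XML in which each bullet comment
# is an element of the form <d p="...">comment text</d>, so the regex above simply
# captures the text body of every such element.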


def SortDanmu():
    # Filter the aggregated danmu for AI-related comments, sort them by frequency,
    # and export the result to an Excel file.
    global sorted_comment_dict
    # Match a standalone "ai"/"AI" (the lookarounds keep it from matching inside
    # other English words) or the Chinese term 人工智能.
    ai_pattern = re.compile(r'(?<![a-zA-Z])ai(?![a-zA-Z])|人工智能', re.IGNORECASE)
    ai_comment = {k: v for k, v in total_comment_dict.items() if ai_pattern.search(k)}  # keep AI-related comments
    sorted_comment_dict = dict(sorted(ai_comment.items(), key=lambda x: x[1], reverse=True))  # sort by frequency, descending
    print(sorted_comment_dict)
    df = pd.DataFrame(list(sorted_comment_dict.items()), columns=['Comment', 'Count'])  # convert to a DataFrame
    df.to_excel('comments.xlsx', index=False)  # save as an Excel file
    print("Successfully wrote the Excel file")


def CreateWordCloud():
    # Build a word cloud from the filtered comments. Each comment is segmented with
    # jieba and the tokens are joined with spaces so that WordCloud can split the
    # text into individual words.
    comment_text = ' '.join(' '.join(jieba.cut(k, cut_all=False)) for k in sorted_comment_dict.keys())
    wordcloud = WordCloud(
        font_path='C:/Windows/Fonts/simsun.ttc',  # a font with Chinese glyphs is required
        width=800, height=400,
        background_color='white',
        max_words=200,
        colormap='viridis'
    ).generate(comment_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')  # render the word cloud image
    plt.axis('off')
    plt.show()  # display the figure
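
# Note: generate(comment_text) above weights every distinct comment equally. If the
# cloud should instead reflect how often each comment appeared, wordcloud also
# offers generate_from_frequencies(); a minimal sketch reusing the styling above and
# the counts already held in sorted_comment_dict:
#
#     token_counts = Counter()
#     for comment, freq in sorted_comment_dict.items():
#         for token in jieba.cut(comment, cut_all=False):
#             token_counts[token] += freq
#     wc = WordCloud(font_path='C:/Windows/Fonts/simsun.ttc', width=800, height=400,
#                    background_color='white', max_words=200, colormap='viridis')
#     wc.generate_from_frequencies(token_counts)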


def main():
    GetCid()           # collect video cids from the search results
    GetDanmu()         # download and aggregate the danmu
    SortDanmu()        # filter, sort, and export AI-related comments
    CreateWordCloud()  # render the word cloud


if __name__ == "__main__":
    main()
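
# A full run crawls up to `count` (300) videos with 0.5-1 second pauses between
# requests, so it can take a while; it writes comments.xlsx to the working
# directory and then opens a matplotlib window showing the word cloud.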