import re
import json
from collections import Counter

import requests
import pandas as pd
import jieba
from wordcloud import WordCloud

# Request headers sent with every request, to get past Bilibili's anti-crawler checks
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "Referer": "https://www.bilibili.com/video/BV1ji421r7PB/?spm_id_from=333.337.search-card.all.click&vd_source=6d043574d81a3f216c41cc281767c59e",
    "Cookie": "buvid4=F793982E-B1BE-6615-D504-4A3BF6DC079274572-022081216-zrNi2ZH69Dhc8fOw09IIaA%3D%3D; buvid_fp_plain=undefined; DedeUserID=254629681; DedeUserID__ckMd5=23182279794f6f43; header_theme_version=CLOSE; enable_web_push=DISABLE; rpdid=0zbfVHb84m|bujW4qg1|H3r|3w1RkXNz; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; hit-dyn-v2=1; CURRENT_FNVAL=4048; LIVE_BUVID=AUTO3317219069887705; buvid3=575DAC67-0FE1-B3C3-8A9F-32190F231A1F24035infoc; b_nut=1723377925; _uuid=24521A4B-895E-10E6A-2555-A9EF812A1AC136123infoc; fingerprint=45b7e451fad392b858fc6e95bbb4231d; PVID=1; SESSDATA=7bb24682%2C1741874111%2Cf9490%2A92CjB3NDHhJCIr34LBlg2TTS7vPGQ72Kh3s3DVcaEn0JkQ_hXc7q6MgsK9Wjb-Z6NTf68SVnlnTkJQZmtmRXMxZnc1RDZpYUFnS1BlTmptUklUSkcyQUNxaWg4Smpla1VCZHdkLVc4dW9OWU4zZm1PaVhuMzFtcnNvQ2JMQzFiTFVleC1HbXc0Sm53IIEC; bili_jct=895f9e091815ee492e57345dc62b8757; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1ODEzMjMsImlhdCI6MTcyNjMyMjA2MywicGx0IjotMX0.47Js9n97GQecalPWdc-MNrDVUOz2-FfY_DlB72DUlZM; bili_ticket_expires=1726581263; CURRENT_QUALITY=32; bp_t_offset_254629681=977402825627664384; buvid_fp=45b7e451fad392b858fc6e95bbb4231d; home_feed_column=5; browser_resolution=1528-704; b_lsid=18C6BCCB_191F83A85C1"
}
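
# Note: the Cookie above is tied to a specific logged-in session (SESSDATA, bili_jct, etc.)
# and will expire; if the requests start returning empty or error responses, replace it
# with a fresh Cookie copied from your own logged-in browser session.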

def get_bvlist():
    # Collect BV ids from the search results
    bvlist = []
    for page in range(1, 11):
        # Search URL for the keyword "2024巴黎奥运会" (2024 Paris Olympics), 30 results per page
        url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page={page}&page_size=30&pubtime_begin_s=0&pubtime_end_s=0&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
        # Send the request
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        # Extract the BV ids with a regular expression, deduplicating within each page
        tem_bvlist = re.findall('"bvid":"(.*?)"', response.text)
        bvlist.extend(list(set(tem_bvlist)))
    print("Finished collecting BV ids")
    return bvlist
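
# A possible alternative inside get_bvlist (assuming the search endpoint returns JSON shaped
# like {"data": {"result": [{"bvid": "...", ...}, ...]}}): parse the body as JSON instead of
# using a regex, e.g.
#     results = response.json().get('data', {}).get('result', [])
#     tem_bvlist = [item['bvid'] for item in results if 'bvid' in item]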

def get_cidlist(bvlist):
    # For each BV id, fetch the cid of the video's first part
    cidlist = []
    for bvid in bvlist:
        # URL used to look up the cid for a given BV id
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        # Send the request
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        # Extract the cid from the JSON response
        json_dict = json.loads(res.text)
        cidlist.append(json_dict["data"][0]["cid"])
    print("Finished collecting cids")
    return cidlist
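
# Note: the pagelist response is roughly {"code": 0, "data": [{"cid": ..., "page": 1, ...}, ...]}
# (shape assumed from observed responses); only data[0]["cid"] is read above, so for multi-part
# videos the danmaku of later parts are not collected.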

def get_danmu(cidlist):
    # Download the danmaku (bullet comments) for each cid
    count = 1
    for cid in cidlist:
        # URL of the danmaku XML for this cid
        url = f'http://comment.bilibili.com/{cid}.xml'
        # Send the request
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        # Extract the danmaku text with a regular expression
        content_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
        # Append every danmaku of this video to the text file, one per line
        with open('弹幕.txt', 'a', encoding='utf-8') as f:
            for danmu in content_list:
                f.write(danmu + '\n')
        print(f'---------- Danmaku for video {count} downloaded! ----------')
        count += 1
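
# The danmaku endpoint returns XML in which each comment is a <d> element, roughly
# <d p="time,mode,fontsize,color,...">comment text</d>; the p attribute holds display
# metadata, which is why only the element text is captured by the regex above.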

def get_ai_related_danmu():
    # Collect the danmaku related to AI applications
    ai_related_danmu = []
    # Keywords that mark a danmaku as AI-related
    ai_keywords = {'ai', 'ai生成', 'ai合成', 'ai配音', 'gpt', '人工智能', '机器学习', '深度学习', '神经网络',
                   '自然语言处理', '计算机视觉', '语音识别', '人脸识别', '无人驾驶'}
    # Read the saved danmaku file
    with open('弹幕.txt', 'r', encoding='utf-8') as file:
        danmu_lines = file.readlines()
    # Strip newlines and surrounding whitespace to make counting easier
    danmu_cleaned = [line.strip() for line in danmu_lines]
    # Keep only the danmaku that match an AI keyword
    for danmu in danmu_cleaned:
        if any(keyword in danmu.lower() for keyword in ai_keywords):
            # Extra check for danmaku that contain "ai" but are not actually about AI:
            # skip those with an a/i character that is not adjacent to any Chinese character
            # (e.g. "ai" appearing inside an ordinary English word)
            if 'ai' in danmu.lower() and re.search(r'(?<![\u4e00-\u9fa5])[aiAI](?![\u4e00-\u9fa5])', danmu):
                continue
            ai_related_danmu.append(danmu)

    # Count how many times each danmaku appears
    danmu_count = Counter(ai_related_danmu)

    # Take the eight most frequent danmaku
    top_8_danmu = danmu_count.most_common(8)

    # Put the result into a DataFrame
    df = pd.DataFrame(top_8_danmu, columns=['弹幕内容', '数量'])

    # Write the DataFrame to an Excel file
    df.to_excel('AI弹幕统计.xlsx', index=False)
    return ai_related_danmu

def get_wordcloud(ai_danmu):
    # Join the danmaku list into one long string, separated by spaces
    ai = ' '.join(ai_danmu)
    # Segment the string with jieba and re-join the tokens with spaces
    cut_t = ' '.join(jieba.cut(ai))
    # Load stop words to cut down on noise
    with open('停用词.txt', 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}
    # Build the word cloud
    wordcloud = WordCloud(background_color="white",   # white background
                          width=800,                  # image width
                          height=600,                 # image height
                          max_words=150,              # maximum number of words shown
                          max_font_size=80,           # maximum font size
                          font_path='msyh.ttc',       # font that can render Chinese
                          contour_width=2,            # outline width
                          contour_color='steelblue',  # outline colour
                          repeat=True,                # allow words to repeat to fill the canvas
                          stopwords=stop_words,       # stop words to exclude
                          ).generate(cut_t)
    # Save the word cloud as a PNG file
    wordcloud.to_file('词云图.png')
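
# Note: font_path='msyh.ttc' assumes the Microsoft YaHei font file sits in the working
# directory; on Windows it usually lives at C:/Windows/Fonts/msyh.ttc, so adjust the path
# if WordCloud cannot find the font.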

if __name__ == '__main__':
    bvlist = get_bvlist()
    cidlist = get_cidlist(bvlist)
    get_danmu(cidlist)
    ai_danmu = get_ai_related_danmu()
    get_wordcloud(ai_danmu)