You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

124 lines
6.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
import requests
import json
from wordcloud import WordCloud
from collections import Counter
import pandas as pd
import jieba
# Request headers used to get past Bilibili's anti-scraping checks:
# a real browser user-agent, a Referer, and a logged-in session Cookie.
# NOTE(review): the Cookie embeds personal session credentials (SESSDATA,
# bili_jct) hard-coded in source — these expire and should be moved out of
# the code (env var / config file) rather than committed.
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
"Referer": "https://www.bilibili.com/video/BV1ji421r7PB/?spm_id_from=333.337.search-card.all.click&vd_source=6d043574d81a3f216c41cc281767c59e",
"Cookie": "buvid4=F793982E-B1BE-6615-D504-4A3BF6DC079274572-022081216-zrNi2ZH69Dhc8fOw09IIaA%3D%3D; buvid_fp_plain=undefined; DedeUserID=254629681; DedeUserID__ckMd5=23182279794f6f43; header_theme_version=CLOSE; enable_web_push=DISABLE; rpdid=0zbfVHb84m|bujW4qg1|H3r|3w1RkXNz; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; hit-dyn-v2=1; CURRENT_FNVAL=4048; LIVE_BUVID=AUTO3317219069887705; buvid3=575DAC67-0FE1-B3C3-8A9F-32190F231A1F24035infoc; b_nut=1723377925; _uuid=24521A4B-895E-10E6A-2555-A9EF812A1AC136123infoc; fingerprint=45b7e451fad392b858fc6e95bbb4231d; PVID=1; SESSDATA=7bb24682%2C1741874111%2Cf9490%2A92CjB3NDHhJCIr34LBlg2TTS7vPGQ72Kh3s3DVcaEn0JkQ_hXc7q6MgsK9Wjb-Z6NTf68SVnlnTkJQZmtmRXMxZnc1RDZpYUFnS1BlTmptUklUSkcyQUNxaWg4Smpla1VCZHdkLVc4dW9OWU4zZm1PaVhuMzFtcnNvQ2JMQzFiTFVleC1HbXc0Sm53IIEC; bili_jct=895f9e091815ee492e57345dc62b8757; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1ODEzMjMsImlhdCI6MTcyNjMyMjA2MywicGx0IjotMX0.47Js9n97GQecalPWdc-MNrDVUOz2-FfY_DlB72DUlZM; bili_ticket_expires=1726581263; CURRENT_QUALITY=32; bp_t_offset_254629681=977402825627664384; buvid_fp=45b7e451fad392b858fc6e95bbb4231d; home_feed_column=5; browser_resolution=1528-704; b_lsid=18C6BCCB_191F83A85C1"
}
def get_bvlist():
    """Search Bilibili for "2024巴黎奥运会" and collect video BV ids.

    Queries pages 1-10 of the web search API and extracts every "bvid"
    field from the raw JSON response with a regex.

    Returns:
        list[str]: BV ids, de-duplicated across all pages in first-seen
        order (the original only de-duplicated within a single page, so
        a video appearing on two result pages was fetched twice later).
    """
    bvlist = []
    seen = set()
    for page in range(1, 11):
        # Search URL for the keyword "2024巴黎奥运会" (URL-encoded).
        # Fix: the original string ended in a stray "&#39" — an HTML-entity
        # artifact of the closing quote introduced when the code was copied.
        url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page={page}&page_size=30&pubtime_begin_s=0&pubtime_end_s=0&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
        response = requests.get(url, headers=headers)
        # Fix: original assigned the bogus codec name 'urf-8', which requests
        # silently ignores — the intended UTF-8 decode never happened.
        response.encoding = 'utf-8'
        # Pull every BV id out of the JSON body with a regex.
        for bvid in re.findall('"bvid":"(.*?)"', response.text):
            if bvid not in seen:
                seen.add(bvid)
                bvlist.append(bvid)
    print("BV号获取完成")
    return bvlist
def get_cidlist(bvlist):
    """Resolve each BV id to the cid of the video's first page.

    Calls the player pagelist API once per BV id.

    Parameters:
        bvlist (list[str]): BV ids, e.g. from get_bvlist().

    Returns:
        list[int]: one cid per resolvable video. Videos whose response has
        no usable "data" payload are skipped instead of crashing the run
        (the original raised on a missing key / empty list).
    """
    cidlist = []
    for bvid in bvlist:
        # Pagelist endpoint that maps a BV id to its page cids.
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        res = requests.get(url, headers=headers)
        # Fix: original assigned the bogus codec name 'urf-8'.
        res.encoding = 'utf-8'
        # res.json() decodes the body directly (replaces json.loads(res.text));
        # guard against a null/empty "data" field for deleted or region-locked
        # videos so one bad id does not abort the whole batch.
        pages = res.json().get("data") or []
        if pages:
            cidlist.append(pages[0]["cid"])
    print("cid获取完成")
    return cidlist
def get_danmu(cidlist):
    """Download the danmu (bullet comments) for each cid into '弹幕.txt'.

    Fetches the XML danmu feed per cid and appends one danmu per line to
    the output file.

    Parameters:
        cidlist (list[int]): cids, e.g. from get_cidlist().

    Side effects:
        Appends to '弹幕.txt' (UTF-8); prints a progress line per video.
    """
    # Fix: the original re-opened the file in append mode once per danmu
    # line, inside the innermost loop. Open it once for the whole run.
    with open('弹幕.txt', 'a', encoding='utf-8') as f:
        for count, cid in enumerate(cidlist, start=1):
            # XML feed containing this video's danmu.
            url = f'http://comment.bilibili.com/{cid}.xml'
            response = requests.get(url, headers=headers)
            response.encoding = 'utf-8'
            # Each danmu sits in a <d p="..."> element.
            content_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
            f.writelines(danmu + '\n' for danmu in content_list)
            print(f'----------第{count}个视频弹幕获取完毕!----------')
def get_ai_related_danmu():
    """Filter the collected danmu for AI-related content.

    Reads '弹幕.txt' (one danmu per line), keeps lines matching an AI
    keyword list, writes the 8 most frequent ones to 'AI弹幕统计.xlsx',
    and returns the full filtered list.

    Returns:
        list[str]: all danmu judged AI-related (may contain duplicates).
    """
    ai_related_danmu = []
    # Keyword set used to decide whether a danmu mentions AI technology.
    ai_keywords = {'ai', 'ai生成', 'ai合成', 'ai配音', 'gpt', '人工智能', '机器学习', '深度学习', '神经网络',
                   '自然语言处理', '计算机视觉', '语音识别', '人脸识别', '无人驾驶'}
    # Read the danmu file produced by get_danmu().
    with open('弹幕.txt', 'r', encoding='utf-8') as file:
        danmu_lines = file.readlines()
    # Strip trailing newlines/whitespace so identical danmu compare equal.
    danmu_cleaned = [line.strip() for line in danmu_lines]
    # Keep danmu that contain at least one keyword (case-insensitive).
    for danmu in danmu_cleaned:
        if any(keyword in danmu.lower() for keyword in ai_keywords):
            # Extra screening for danmu that contain the letters "ai" only
            # incidentally (e.g. inside an English word or pinyin).
            # NOTE(review): the regex matches a SINGLE 'a'/'i'/'A'/'I' that is
            # not sandwiched between CJK characters, so a danmu is discarded
            # whenever ANY such letter touches a non-Chinese neighbour — e.g.
            # "AI真强" is skipped (the 'A' is followed by 'I') while "这AI真强"
            # is kept. Confirm this asymmetry is the intended heuristic.
            if 'ai' in danmu.lower() and re.search(r'(?<![\u4e00-\u9fa5])[aiAI](?![\u4e00-\u9fa5])', danmu):
                continue
            ai_related_danmu.append(danmu)
    # Frequency of each distinct danmu text.
    danmu_count = Counter(ai_related_danmu)
    # Top 8 danmu by count.
    top_8_danmu = danmu_count.most_common(8)
    # Tabulate and export to Excel (requires an Excel writer such as openpyxl).
    df = pd.DataFrame(top_8_danmu, columns=['弹幕内容', '数量'])
    df.to_excel('AI弹幕统计.xlsx', index=False)
    return(ai_related_danmu)
def get_wordcloud(ai_danmu):
    """Render the AI-related danmu as a word cloud and save '词云图.png'.

    Parameters:
        ai_danmu (list[str]): danmu strings, e.g. from get_ai_related_danmu().

    Side effects:
        Reads stop words from '停用词.txt'; writes '词云图.png'.
    """
    # Join all danmu into one text, then segment it with jieba — WordCloud
    # needs whitespace-separated tokens to count Chinese words.
    text = ' '.join(ai_danmu)
    cut_t = ' '.join(jieba.cut(text))
    # Load stop words to drop uninformative tokens. Fix: the original called
    # open() without ever closing the file handle; 'with' closes it.
    with open('停用词.txt', 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}
    wordcloud = WordCloud(background_color="white",  # white canvas
                          width=800,                 # image width in px
                          height=600,                # image height in px
                          max_words=150,             # cap on rendered words
                          max_font_size=80,          # largest font size
                          font_path='msyh.ttc',      # CJK-capable font, needed for Chinese
                          contour_width=2,           # outline width
                          contour_color='steelblue', # outline colour
                          repeat=True,               # fix: was the string 'true', not a bool
                          stopwords=stop_words,      # words to exclude
                          ).generate(cut_t)
    # Save the rendered cloud as a PNG file.
    wordcloud.to_file('词云图.png')
if __name__ == '__main__':
    # Full pipeline: search videos -> resolve cids -> download danmu
    # -> filter AI-related danmu -> render the word cloud.
    video_bvs = get_bvlist()
    video_cids = get_cidlist(video_bvs)
    get_danmu(video_cids)
    related_danmu = get_ai_related_danmu()
    get_wordcloud(related_danmu)