You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

105 lines
3.9 KiB

import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Number of videos crawled so far.
cnt = 0

# Accumulated danmaku (bullet comments) collected from every crawled video.
danmuku_all = []

# HTTP headers sent with every request. The "cookie" value is a placeholder.
headers = {
    "cookie": "cookie",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/128.0.0.0 Safari/537.36"
    ),
}
def get_cid(bvid):
    """Look up the cid of the first entry in a video's page list.

    Requests the player ``pagelist`` endpoint for *bvid* and reads
    ``data[0]['cid']`` from the JSON reply.

    Args:
        bvid: The video's BV identifier, sent as the ``bvid`` query parameter.

    Returns:
        The ``cid`` value of the first ``data`` entry, or ``None`` when the
        request fails, the body is not JSON, or the reply does not contain
        a non-empty ``data`` list.
    """
    url = "https://api.bilibili.com/x/player/pagelist"
    try:
        response = requests.get(
            url, params={"bvid": bvid}, headers=headers, timeout=10
        )
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"请求失败: {e}")
        return None

    try:
        return payload['data'][0]['cid']
    except (KeyError, IndexError, TypeError):
        # An error reply may omit 'data', set it to null, or leave it empty;
        # treat all of these as "no cid" instead of crashing the crawl.
        print(f"未获取到cid: {bvid}")
        return None
def get_danmuku(cid):
    """Download the danmaku XML for *cid* and return the comment texts.

    Args:
        cid: Identifier interpolated into the comment XML URL, or ``None``
            when the lookup that produced it failed.

    Returns:
        A list with the text of every ``<d>`` element in the document; an
        empty list when *cid* is ``None`` or the request fails.
    """
    if cid is None:
        return []

    url = f"https://comment.bilibili.com/{cid}.xml"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Match the other request sites: do not parse an HTTP error page
        # as if it were a danmaku document.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return []

    # Force UTF-8 so the text is decoded consistently before parsing.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'xml')
    return [node.text for node in soup.find_all('d')]
# Walk search-result pages 1 through 21 (range(1, 22) excludes 22) and collect
# danmaku until 300 videos have been processed.
for page in range(1, 22):
    try:
        response = requests.get(
            'https://api.bilibili.com/x/web-interface/search/type',
            params={
                'search_type': 'video',
                'keyword': '巴黎奥运会',
                'page': page,
            },
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"请求失败: {e}")
        results = []
    else:
        # An error reply or an exhausted page may lack 'data' or 'result';
        # treat that as an empty page rather than aborting the whole crawl.
        data = payload.get('data') if isinstance(payload, dict) else None
        results = (data.get('result') if isinstance(data, dict) else None) or []

    # Only take as many videos as are still needed to reach 300.
    for result in results[:300 - cnt]:
        cid = get_cid(result['bvid'])
        danmuku = get_danmuku(cid)
        danmuku_all.extend(danmuku)
        cnt += 1

    if cnt >= 300:
        break
    time.sleep(1)  # Pause one second between pages to avoid being blocked.
def filter_danmuku(danmuku_list, keywords):
    """Return the danmaku that contain at least one keyword, ignoring case.

    Args:
        danmuku_list: Iterable of danmaku strings to search.
        keywords: Iterable of keyword strings; matching is by lowercase
            substring.

    Returns:
        A new list holding the matching danmaku, unmodified and in their
        original order.
    """
    needles = [kw.lower() for kw in keywords]
    kept = []
    for entry in danmuku_list:
        for needle in needles:
            if needle in entry.lower():
                kept.append(entry)
                break  # One hit is enough; move on to the next danmaku.
    return kept
# Load the saved danmaku, one per line. Surrounding whitespace (including the
# trailing newline) is stripped BEFORE counting so that identical danmaku are
# not tallied as different keys.
try:
    with open('所有视频弹幕.txt', 'r', encoding='utf-8') as file:
        danmuku_all = [line.strip() for line in file]
except FileNotFoundError:
    # NOTE(review): nothing visible in this script writes this file, so fall
    # back to the danmaku already held in memory instead of crashing.
    # Confirm whether a separate save step is expected.
    danmuku_all = [line.strip() for line in danmuku_all]

# Keep only the danmaku that mention one of the AI-related keywords.
keywords = [
    'AI配音', 'ai配音', '人工智能', 'ai画图', 'AI画图', 'AI识曲', 'AI生成',
    '神经网络', '卷积神经网络', '循环神经网络', '智能家居', '自动驾驶',
    '智能推荐', '智能算法', '强化学习', '计算机视觉', 'ai还原', 'ai合成',
]
filtered_danmuku = filter_danmuku(danmuku_all, keywords)

# Count occurrences and keep the eight most frequent danmaku.
counter = Counter(filtered_danmuku)
most_common = counter.most_common(8)

# Write the result to Excel as two columns: danmaku text and its count.
data = {
    '弹幕内容': [content for content, count in most_common],
    '数量': [count for content, count in most_common],
}
df = pd.DataFrame(data)
df.to_excel('AI_人工智能_弹幕统计.xlsx', index=False)
print("前8位弹幕统计已保存到 'AI_人工智能_弹幕统计.xlsx'.")
# Font used to render the word cloud (SimHei, at its Windows install path).
font_path = r'C:\Windows\Fonts\simhei.ttf'
try:
    # Read back the statistics sheet and make sure the text column exists.
    df = pd.read_excel('AI_人工智能_弹幕统计.xlsx')
    column_present = '弹幕内容' in df.columns
    if not column_present:
        raise ValueError("Excel 文件中没有找到 '弹幕内容'")

    # Join every non-missing danmaku into one space-separated string.
    phrases = df['弹幕内容'].dropna()
    text = ' '.join(phrases)

    cloud_options = {
        'font_path': font_path,
        'width': 800,
        'height': 400,
        'background_color': 'white',
    }
    wordcloud = WordCloud(**cloud_options).generate(text)

    # Display the image without axes, then save it to disk.
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    wordcloud.to_file('词云图.png')
except FileNotFoundError:
    print("文件未找到,请检查文件路径")
except ValueError as value_error:
    print(value_error)
except Exception as error:
    print(f"发生错误: {error}")