parent
c8a122faed
commit
ad1dbd0191
@ -0,0 +1,94 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import time
|
||||||
|
import pandas as pd
|
||||||
|
from collections import Counter
|
||||||
|
from wordcloud import WordCloud
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
# Shared crawl state and request configuration.
cnt = 0  # number of videos crawled so far
danmuku_all = []  # pool of every danmaku (bullet comment) collected

# Browser-like User-Agent sent with every HTTP request made by this script.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/128.0.0.0 Safari/537.36'
    ),
}
|
||||||
|
def get_cid(bvid):
    """Look up the cid of a video from its bvid.

    Queries the player page-list endpoint and returns the ``cid`` of the
    first entry in the response's ``data`` list.

    Args:
        bvid: The video's bvid string, as found in the search results.

    Returns:
        The cid value, or ``None`` when the request fails or the response
        does not have the expected shape.
    """
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        payload = response.json()
        return payload['data'][0]['cid']
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return None
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # Body was not JSON, or lacked a non-empty 'data' list; treat it like
        # a failed request so one bad video does not abort the whole crawl.
        print(f"请求失败: {e}")
        return None
|
||||||
|
def get_danmuku(cid):
    """Download the danmaku (bullet comments) for one cid.

    Fetches the comment XML for ``cid`` and returns the text of every
    ``<d>`` element in it.

    Args:
        cid: The cid returned by ``get_cid``, or ``None`` if that lookup
            failed.

    Returns:
        A list of danmaku strings; empty when ``cid`` is ``None`` or the
        request fails.
    """
    if cid is None:
        return []
    url = f"https://comment.bilibili.com/{cid}.xml"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Fail on HTTP errors instead of parsing an error page as danmaku.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'xml')
        return [node.text for node in soup.find_all('d')]
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return []
|
||||||
|
# Walk the video search results for the keyword page by page, fetch the
# danmaku of every video found, and stop once 300 videos have been processed.
for page in range(1, 22):
    try:
        response = requests.get(
            'https://api.bilibili.com/x/web-interface/search/type',
            params={
                'search_type': 'video',
                'keyword': '巴黎奥运会',
                'page': page,
            },
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
        payload = response.json()
        # Defensive: tolerate a payload without 'data' / 'result' instead of
        # raising KeyError, which the handler below would not catch.
        results = (payload.get('data') or {}).get('result') or []
        for result in results:
            bvid = result.get('bvid')
            if not bvid:
                continue
            cid = get_cid(bvid)
            danmuku = get_danmuku(cid)
            danmuku_all.extend(danmuku)
            cnt += 1
            if cnt >= 300:
                break
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"请求失败: {e}")
    if cnt >= 300:
        break
    time.sleep(1)  # wait 1 second between pages to avoid being blocked
|
||||||
|
def filter_danmuku(danmuku_list, keywords):
    """Return the danmaku that contain at least one of the keywords.

    Matching is a case-insensitive substring test: both the keywords and
    each danmaku are lower-cased before comparison.

    Args:
        danmuku_list: Iterable of danmaku strings.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of the matching danmaku, unchanged and in their original
        order. Empty when there are no keywords.
    """
    # Lower-case the keywords once instead of once per danmaku.
    keywords_lower = tuple(keyword.lower() for keyword in keywords)
    filtered = []
    for danmuku in danmuku_list:
        lowered = danmuku.lower()
        if any(keyword in lowered for keyword in keywords_lower):
            filtered.append(danmuku)
    return filtered
|
||||||
|
|
||||||
|
# Read the danmaku file, one danmaku per line.
# NOTE(review): no code visible in this file writes '所有视频弹幕.txt'; the
# danmaku collected by the crawl above are replaced by the file's contents
# here. Confirm where this file is expected to come from.
with open('所有视频弹幕.txt', 'r', encoding='utf-8') as file:
    # Strip each line up front so the same danmaku with different trailing
    # whitespace or newline is counted as one entry; skip blank lines.
    stripped_lines = [line.strip() for line in file]
danmuku_all = [line for line in stripped_lines if line]

# Keep only the danmaku that contain one of the AI-related keywords.
keywords = [
    'AI识曲', 'AI生成', '神经网络', '卷积神经网络', '循环神经网络',
    '智能家居', '自动驾驶', '智能推荐', '智能算法', '强化学习',
    '计算机视觉', 'ai还原', 'ai合成',
]
filtered_danmuku = filter_danmuku(danmuku_all, keywords)

# Count identical danmaku and keep the 8 most frequent.
counter = Counter(filtered_danmuku)
most_common = counter.most_common(8)

# Write the result to Excel as two columns: danmaku text and its count.
data = {
    '弹幕内容': [content.strip() for content, count in most_common],
    '数量': [count for content, count in most_common],
}
df = pd.DataFrame(data)
df.to_excel('AI_人工智能_弹幕统计.xlsx', index=False)
print("前8位弹幕统计已保存到 'AI_人工智能_弹幕统计.xlsx'.")
|
||||||
|
# Font passed to WordCloud for rendering the text (SimHei, Windows font
# directory).
font_path = r'C:\Windows\Fonts\simhei.ttf'

# Build a word cloud from the exported statistics, save it as a PNG and
# display it.
try:
    df = pd.read_excel('AI_人工智能_弹幕统计.xlsx')
    if '弹幕内容' not in df.columns:
        raise ValueError("Excel 文件中没有找到 '弹幕内容' 列")
    # Cast to str: cells that look numeric come back from Excel as numbers,
    # which str.join would reject.
    text = ' '.join(df['弹幕内容'].dropna().astype(str))
    wordcloud = WordCloud(
        font_path=font_path,
        width=800,
        height=400,
        background_color='white',
    ).generate(text)
    # Save before showing so the image is written even if the display step
    # blocks or fails.
    wordcloud.to_file('词云图.png')
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
except FileNotFoundError:
    print("文件未找到,请检查文件路径")
except ValueError as ve:
    print(ve)
except Exception as e:
    # Last-resort handler for this script: report the error and continue.
    print(f"发生错误: {e}")
|
Loading…
Reference in new issue