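"""Scrape danmaku (bullet comments) from Bilibili videos found by searching
for "巴黎奥运会" (Paris Olympics), filter them for AI-related keywords, export
the most frequent entries to Excel, and render a word cloud from saved .txt
files."""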
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os
cnt = 0  # number of videos scraped so far
danmuku_all = []  # collected danmaku from all videos
headers = {
    "cookie": "cookie",  # placeholder: paste your own Bilibili cookie; the search API may reject cookieless requests
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
def get_cid(bvid):
    """Return the cid of the first page of the video identified by bvid."""
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        return data['data'][0]['cid']
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None
def get_danmuku(cid):
    """Fetch the danmaku XML for a cid and return the comment texts."""
    if cid is None:
        return []
    url = f"https://comment.bilibili.com/{cid}.xml"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'xml')  # the 'xml' parser needs lxml installed
        return [d.text for d in soup.find_all('d')]
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return []
for page in range(1, 22):  # pages 1-21, ~20 results each, enough for 300 videos
    url = f'https://api.bilibili.com/x/web-interface/search/type?search_type=video&keyword=巴黎奥运会&page={page}'
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        results = response.json()['data']['result']
        for result in results:
            cid = get_cid(result['bvid'])
            danmuku_all.extend(get_danmuku(cid))
            cnt += 1
            if cnt >= 300:
                break
        if cnt >= 300:
            break
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
    time.sleep(1)  # pause 1 second between pages to avoid being blocked
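# The filter stage below reads '所有视频弹幕.txt', but the original script never
# writes it; this save step is an assumed addition so the read has data to work
# with (one danmaku per line, matching the readlines() call that follows).
with open('所有视频弹幕.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(danmuku_all))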
def filter_danmuku(danmuku_list, keywords):
    """Return the danmaku that contain any of the keywords (case-insensitive)."""
    keywords_lower = [keyword.lower() for keyword in keywords]
    return [d for d in danmuku_list if any(keyword in d.lower() for keyword in keywords_lower)]
# Read the saved danmaku file
with open('所有视频弹幕.txt', 'r', encoding='utf-8') as file:
    danmuku_all = file.readlines()
# Keep only danmaku that mention AI-related keywords
keywords = ['AI配音', 'ai配音', '人工智能', 'ai画图', 'AI画图', 'AI识曲', 'AI生成',
            '神经网络', '卷积神经网络', '循环神经网络', '智能家居', '自动驾驶',
            '智能推荐', '智能算法', '强化学习', '计算机视觉', 'ai还原', 'ai合成']
filtered_danmuku = filter_danmuku(danmuku_all, keywords)
# Count duplicates and take the 8 most frequent danmaku
counter = Counter(filtered_danmuku)
most_common = counter.most_common(8)
print("Top 8 danmaku and their counts:")
for content, count in most_common:
    print(f"danmaku: {content.strip()} - count: {count}")
# Write the results to Excel, one column per field (requires openpyxl)
data = {'弹幕内容': [content.strip() for content, count in most_common],
        '数量': [count for content, count in most_common]}
df = pd.DataFrame(data)
df.to_excel('AI_人工智能_弹幕统计.xlsx', index=False)
print("Top 8 danmaku statistics saved to 'AI_人工智能_弹幕统计.xlsx'.")
# Build a word cloud from every .txt file in a folder
font_path = r'C:\Windows\Fonts\simhei.ttf'  # SimHei, a font with Chinese glyphs
text = ""
txt_folder = r'D:\tongfu.net\projects\python\test'  # replace with the folder containing the .txt files
try:
    for filename in os.listdir(txt_folder):
        if filename.endswith('.txt'):
            with open(os.path.join(txt_folder, filename), 'r', encoding='utf-8') as file:
                text += file.read() + ' '
    if not text:
        raise ValueError("No data extracted from the .txt files")
    wordcloud = WordCloud(font_path=font_path, width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    wordcloud.to_file('词云图.png')
except FileNotFoundError:
    print("File not found; check the folder path")
except ValueError as ve:
    print(ve)
except Exception as e:
    print(f"An error occurred: {e}")