import os
import time
from collections import Counter

import requests
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from wordcloud import WordCloud

cnt = 0           # number of videos scraped so far
danmuku_all = []  # collected danmaku (bullet-screen comments)

headers = {
    "cookie": "cookie",  # placeholder: paste a real logged-in bilibili cookie here
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
}
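
# Note: bilibili's web search API is known to intercept anonymous requests
# (error code -412), so the cookie placeholder above has to be replaced with a
# real browser cookie before the search loop below will return results.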


def get_cid(bvid):
    """Look up the cid (danmaku id) of a video's first part from its bvid."""
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        return data['data'][0]['cid']
    except (requests.exceptions.RequestException, KeyError, IndexError, TypeError) as e:
        print(f"Request failed: {e}")
        return None
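
# The pagelist endpoint returns one entry per video part; indexing data[0]
# means multi-part videos contribute only their first part's danmaku.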


def get_danmuku(cid):
    """Download the danmaku XML for a cid and return the comment texts."""
    if cid is None:
        return []
    url = f"https://comment.bilibili.com/{cid}.xml"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'xml')  # the 'xml' parser needs lxml installed
        return [d.text for d in soup.find_all('d')]
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return []
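
# Each <d> element wraps one danmaku's text; its p attribute carries timing and
# style metadata (offset in seconds, mode, font size, color, ...) that this
# script ignores.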


for page in range(1, 22):  # pages 1-21 at ~20 results each, enough for 300 videos
    # search keyword: 巴黎奥运会 ("Paris Olympics")
    url = f'https://api.bilibili.com/x/web-interface/search/type?search_type=video&keyword=巴黎奥运会&page={page}'
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        results = response.json()['data']['result']
        for result in results:
            cid = get_cid(result['bvid'])
            danmuku_all.extend(get_danmuku(cid))
            cnt += 1
            if cnt >= 300:
                break
        if cnt >= 300:
            break
    except (requests.exceptions.RequestException, KeyError, TypeError) as e:
        print(f"Request failed: {e}")
    time.sleep(1)  # pause 1 second between pages to avoid being blocked
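
# The re-read below expects '所有视频弹幕.txt' to exist, but nothing above ever
# writes it, so a fresh run would crash. A minimal bridge, assuming one danmaku
# per line is the intended file format:
with open('所有视频弹幕.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(danmuku_all))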


def filter_danmuku(danmuku_list, keywords):
    """Keep only the danmaku that contain at least one keyword (case-insensitive)."""
    keywords_lower = [keyword.lower() for keyword in keywords]
    return [d for d in danmuku_list if any(keyword in d.lower() for keyword in keywords_lower)]
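
# For example, filter_danmuku(['AI生成的吧', '太精彩了'], ['ai生成']) keeps only
# the first entry, because both sides of the match are lowercased.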
# Re-read the saved danmaku file
with open('所有视频弹幕.txt', 'r', encoding='utf-8') as file:
    danmuku_all = file.readlines()  # every entry keeps its trailing '\n'

# Keep only danmaku that mention AI-related topics
keywords = ['AI配音', 'ai配音', '人工智能', 'ai画图', 'AI画图', 'AI识曲', 'AI生成',
            '神经网络', '卷积神经网络', '循环神经网络', '智能家居', '自动驾驶',
            '智能推荐', '智能算法', '强化学习', '计算机视觉', 'ai还原', 'ai合成']
filtered_danmuku = filter_danmuku(danmuku_all, keywords)

# Tally identical danmaku and take the 8 most frequent
counter = Counter(filtered_danmuku)
most_common = counter.most_common(8)

# Print the top 8 danmaku
print("Top 8 danmaku and their counts:")
for content, count in most_common:
    print(f"danmaku: {content.strip()} - count: {count}")

# Write the results to Excel, one column per field
data = {'弹幕内容': [content.strip() for content, count in most_common],
        '数量': [count for content, count in most_common]}
df = pd.DataFrame(data)
df.to_excel('AI_人工智能_弹幕统计.xlsx', index=False)
print("Top 8 danmaku statistics saved to 'AI_人工智能_弹幕统计.xlsx'.")
# Build a word cloud from every .txt file in a folder
font_path = r'C:\Windows\Fonts\simhei.ttf'  # SimHei: a font that can render Chinese glyphs
text = ""

txt_folder = r'D:\tongfu.net\projects\python\test'  # replace with the folder holding the .txt files

try:
    # Concatenate the contents of every .txt file in the folder
    for filename in os.listdir(txt_folder):
        if filename.endswith('.txt'):
            with open(os.path.join(txt_folder, filename), 'r', encoding='utf-8') as file:
                text += file.read() + ' '

    if not text:
        raise ValueError("No data extracted from the .txt files")
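
    # NOTE: WordCloud's default tokenizer does not segment Chinese, so raw
    # danmaku text tends to surface as whole phrases rather than single words.
    # A common fix (assuming the third-party jieba package is installed) is to
    # pre-segment the text first:
    #     import jieba
    #     text = ' '.join(jieba.cut(text))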
    wordcloud = WordCloud(font_path=font_path, width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    wordcloud.to_file('词云图.png')
except FileNotFoundError:
    print("File not found; check the folder path")
except ValueError as ve:
    print(ve)
except Exception as e:
    print(f"An error occurred: {e}")