You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

69 lines
2.4 KiB

import requests
from bs4 import BeautifulSoup
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Request headers that mimic a desktop browser.
# NOTE(review): the "Chrome/" and "Edg/" tokens carry no version numbers;
# the User-Agent value looks truncated -- confirm the intended string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36 Edg/",
}
# Check whether a text mentions "AI" or "人工智能" (artificial intelligence).
def contains_ai_or_artificial_intelligence(text):
    """Return True if *text* contains "ai" (any letter case) or "人工智能".

    The check is a plain substring test, so "ai" embedded in a longer word
    (for example "wait") also counts as a match.

    Args:
        text: The string to inspect.

    Returns:
        True when either keyword occurs in *text*; False otherwise, and
        also False for any non-string input such as ``None``.
    """
    if not isinstance(text, str):
        # A missing or non-text comment cannot mention either keyword.
        return False
    lowered = text.lower()  # lowercase once; both checks reuse it
    return "ai" in lowered or "人工智能" in lowered
# Fetch the content of a web page.
def get_html(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    The request is sent with the module-level ``headers``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The response text, decoded as UTF-8. The HTTP status code is not
        checked, so the body of an error response is returned as well.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 rather than relying on the encoding guessed by requests.
    response.encoding = 'utf-8'
    return response.text
# Parse a page and collect its video links.
def parse_video_links(html):
    """Return the href of every ``<a>`` tag that points at a video page.

    Args:
        html: HTML source of the page to scan.

    Returns:
        A list of href strings that start with ``'/video/'``, in document
        order. Duplicates are kept. The list is empty when no anchor
        matches.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # href=True skips anchors without an href attribute, so the lookup
    # below cannot raise KeyError.
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('/video/')
    ]
# Fetch the danmaku (on-screen comment) data for a video.
def get_danmaku_data(video_id, timeout=10):
    """Return the danmaku texts for *video_id*.

    Args:
        video_id: Identifier of the video whose danmaku are requested.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        A list with the ``'content'`` value of every entry under
        ``['data']['list']`` in the JSON response. An empty list is
        returned when the status code is not 200 or the body is not
        valid JSON.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        KeyError: If the JSON payload lacks the expected keys.
    """
    # NOTE(review): the URL consists of the video id alone; the endpoint
    # prefix appears to be missing from this f-string -- confirm.
    danmaku_url = f"{video_id}"
    response = requests.get(danmaku_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return []
    try:
        danmaku_data = response.json()
    except ValueError:
        # A 200 response without a JSON body is treated like any other
        # failed fetch instead of aborting the whole run.
        return []
    return [danmaku['content'] for danmaku in danmaku_data['data']['list']]
# Main program.
def main():
    """Collect AI-related danmaku, export them to Excel and draw a word cloud.

    Steps:
        1. Fetch the listing page and extract its video links.
        2. Download the danmaku of every video and keep those that mention
           "AI" or "人工智能".
        3. Print how many were kept and write them to ``ai_danmaku.xlsx``.
        4. Render a word cloud of the kept danmaku, if there are any.
    """
    ai_danmaku_list = []  # AI-related danmaku gathered across all videos

    # NOTE(review): the listing-page URL is an empty string here and
    # requests rejects it -- confirm the intended address.
    video_links = parse_video_links(get_html(""))
    for link in video_links:
        # Links have the form '/video/<id>...', so after splitting on '/'
        # the id is the element at index 2.
        video_id = link.split('/')[2]
        danmakus = get_danmaku_data(video_id)
        ai_danmaku_list.extend(
            danmaku
            for danmaku in danmakus
            if contains_ai_or_artificial_intelligence(danmaku)
        )

    # Report the number of AI-related danmaku.
    print(f"AI相关的弹幕数量: {len(ai_danmaku_list)}")

    # Write the results to an Excel file.
    df = pd.DataFrame(ai_danmaku_list, columns=['弹幕'])
    df.to_excel('ai_danmaku.xlsx', index=False)

    # Generate the word cloud. WordCloud.generate raises ValueError when
    # it receives no words, so skip the plot when nothing matched.
    if not ai_danmaku_list:
        return
    text = ' '.join(ai_danmaku_list)
    # NOTE(review): no font_path is passed; the default WordCloud font may
    # not contain CJK glyphs, so Chinese text may render as boxes -- confirm.
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # axis ticks carry no meaning for a word cloud
    plt.show()  # imshow only draws onto the figure; show() displays it
if __name__ == "__main__":