You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
homework/102201216李俊辉.py

69 lines
2.4 KiB

import re

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
# Request headers that make the crawler look like a desktop browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0"
    ),
}
# "ai" (any letter case) that is not part of a longer run of ASCII letters.
# A plain substring test would also fire on words such as "wait" or "Spain".
_STANDALONE_AI = re.compile(r"(?<![a-z])ai(?![a-z])", re.IGNORECASE)


def contains_ai_or_artificial_intelligence(text):
    """Return True if *text* mentions "AI" or "人工智能".

    "AI" is matched case-insensitively, but only as a standalone token:
    it may be surrounded by CJK characters, digits, punctuation or
    whitespace, not by other ASCII letters.

    Args:
        text: The string to inspect. ``None`` and ``""`` are treated as
            containing no match.

    Returns:
        bool: Whether either keyword was found.
    """
    if not text:
        return False
    # The Chinese keyword has no letter case, so no lowering is needed for it.
    return "人工智能" in text or _STANDALONE_AI.search(text) is not None
def get_html(url, timeout=10):
    """Download *url* and return the response body as UTF-8 text.

    The request is sent with the module-level ``headers``. The HTTP status
    code is not checked, so the body of an error page is returned as-is.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode as UTF-8 regardless of the charset requests would pick itself.
    response.encoding = 'utf-8'
    return response.text
def parse_video_links(html):
    """Extract the distinct ``/video/...`` links from an HTML document.

    Args:
        html: Markup of the page to scan.

    Returns:
        list[str]: The ``href`` of every ``<a>`` tag whose href starts with
        ``/video/``, in document order. A href that appears several times
        is returned once, so callers do not process the same video twice.
    """
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    video_links = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        # NOTE(review): only site-relative hrefs are accepted; absolute or
        # protocol-relative links ("//host/video/...") are skipped. Confirm
        # this matches the markup of the page being parsed.
        if href.startswith('/video/') and href not in seen:
            seen.add(href)
            video_links.append(href)
    return video_links
def get_danmaku_data(video_id, timeout=10):
    """Fetch the danmaku (bullet comment) texts for one video.

    The function is best-effort: any failure to obtain a usable payload
    yields an empty list instead of an exception, so one bad video does not
    abort a whole crawl.

    Args:
        video_id: Identifier placed in the ``oid`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list: The ``content`` value of every entry under
        ``data -> list`` in the JSON response. Returns ``[]`` when the
        request fails, the status is not 200, the body is not JSON, or the
        expected keys are missing.
    """
    danmaku_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={video_id}"
    try:
        response = requests.get(danmaku_url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []

    try:
        payload = response.json()
    except ValueError:
        # NOTE(review): this endpoint may serve XML rather than JSON, and
        # `oid` may need to be a cid rather than the id taken from the page
        # URL. If so, every call ends up here; confirm against the API.
        return []

    # Walk data -> list defensively; any level may be missing or null.
    data = payload.get('data') if isinstance(payload, dict) else None
    entries = data.get('list') if isinstance(data, dict) else None
    if not isinstance(entries, list):
        return []
    return [
        entry['content']
        for entry in entries
        if isinstance(entry, dict) and 'content' in entry
    ]
def main(font_path=None):
    """Crawl danmaku for the search results and report the AI-related ones.

    Steps:
      1. Download the search result page and collect its video links.
      2. Fetch the danmaku of each video and keep those accepted by
         ``contains_ai_or_artificial_intelligence``.
      3. Print how many were kept and write them to ``ai_danmaku.xlsx``.
      4. Show a word cloud of the kept danmaku (skipped when none matched).

    Args:
        font_path: Optional path to a TTF/OTF font passed to ``WordCloud``.
            ``None`` keeps WordCloud's default font.
            NOTE(review): the default font presumably lacks CJK glyphs, so
            Chinese words may render as empty boxes unless a font is given.
    """
    search_url = "https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A"
    video_links = parse_video_links(get_html(search_url))

    ai_danmaku_list = []  # danmaku that mention AI
    for link in video_links:
        # '/video/<id>/...'.split('/') -> ['', 'video', '<id>', ...]
        video_id = link.split('/')[2]
        if not video_id:
            # A bare '/video/' link carries no id to query.
            continue
        ai_danmaku_list.extend(
            danmaku
            for danmaku in get_danmaku_data(video_id)
            if contains_ai_or_artificial_intelligence(danmaku)
        )

    # Report the number of AI-related danmaku.
    print(f"AI相关的弹幕数量: {len(ai_danmaku_list)}")

    # Write the matches to an Excel file (header only when nothing matched).
    df = pd.DataFrame(ai_danmaku_list, columns=['弹幕'])
    df.to_excel('ai_danmaku.xlsx', index=False)

    if not ai_danmaku_list:
        # WordCloud.generate() raises ValueError when given no words.
        return

    # Chinese has no spaces between words, so segment with jieba first;
    # otherwise each whole danmaku would count as a single "word".
    words = [
        word
        for danmaku in ai_danmaku_list
        for word in jieba.lcut(danmaku)
    ]
    text = ' '.join(words)
    wordcloud = WordCloud(
        font_path=font_path,
        width=800,
        height=400,
        background_color='white',
    ).generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()