You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

76 lines
3.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from wordcloud import WordCloud
def _video_id_from_href(href):
    """Return the last path segment of *href* without query, fragment or trailing slash."""
    path = href.split('?', 1)[0].split('#', 1)[0]
    return path.rstrip('/').split('/')[-1]


def fetch_videos(keyword, max_videos=300):
    """
    Search Bilibili for *keyword* and return the IDs of the top results.

    Result pages are requested one after another (``page=1, 2, ...``) until
    ``max_videos`` distinct IDs have been collected, a page contributes no new
    ID, or a request fails. An ID is the last path segment of each result
    link, with any query string, fragment or trailing slash removed.

    :param keyword: search keyword
    :param max_videos: maximum number of IDs to return (default 300)
    :return: list of video IDs in result order, without duplicates; empty if
        the first request fails or nothing matches
    """
    url = "https://search.bilibili.com/all"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    video_ids = []
    seen = set()
    page = 1
    while len(video_ids) < max_videos:
        params = {
            'keyword': keyword,
            'order': 'totalrank',
            'page': str(page),
        }
        try:
            # A timeout keeps a stalled connection from blocking forever.
            response = requests.get(url, headers=headers, params=params, timeout=10)
            response.raise_for_status()  # raise on HTTP error status
        except requests.RequestException as e:
            print(f"Error fetching videos: {e}")
            break  # keep whatever earlier pages already yielded

        soup = BeautifulSoup(response.text, 'html.parser')
        new_ids = []
        for a in soup.select('.video-item .title'):
            if not a.has_attr('href'):
                continue
            video_id = _video_id_from_href(a['href'])
            if video_id and video_id not in seen:
                seen.add(video_id)
                new_ids.append(video_id)

        if not new_ids:
            # No progress on this page: results are exhausted (or the markup
            # did not match), so stop instead of requesting pages forever.
            break
        video_ids.extend(new_ids)
        page += 1

    print(f"Fetched {len(video_ids)} video IDs.")
    return video_ids[:max_videos]
def fetch_danmaku(video_id):
    """
    Fetch the danmaku (bullet comments) for one video.

    The response is decoded as JSON and the ``content`` field of every entry
    under ``data.dm_seg_list`` is returned with HTML-like tags stripped.

    Authentication: if the environment variable ``BILIBILI_COOKIE`` is set,
    its value is sent as the ``Cookie`` header. Session cookies are
    credentials and must not be committed to source control.

    :param video_id: ID used as the ``oid`` query parameter and in the Referer
    :return: list of danmaku strings; empty list on a network/HTTP error or
        when the response does not have the expected shape
    """
    url = f"https://api.bilibili.com/x/v2/dm/web/seg.so?type=1&oid={video_id}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': f'https://www.bilibili.com/video/{video_id}',
    }
    cookie = os.environ.get('BILIBILI_COOKIE')
    if cookie:
        headers['Cookie'] = cookie

    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise on HTTP error status
        # NOTE(review): the "seg.so" endpoint may serve protobuf rather than
        # JSON - confirm. If it does, json() raises and [] is returned below.
        data = response.json()
        danmakus = [
            re.sub(r'<[^>]+>', '', dm['content'])
            for dm in data['data']['dm_seg_list']
        ]
    except requests.RequestException as e:
        print(f"Error fetching danmakus for video {video_id}: {e}")
        return []
    except (KeyError, TypeError, ValueError) as e:
        # KeyError: missing field; TypeError: a null/non-dict level;
        # ValueError: body was not valid JSON (older requests versions).
        print(f"Invalid response format for video {video_id}: {e}")
        return []

    print(f"Fetched {len(danmakus)} danmakus for video {video_id}.")
    return danmakus
def analyze_danmakus(danmakus):
    """
    Count how often each AI-related danmaku occurs.

    A danmaku is AI-related when it contains at least one of the keywords
    below (case-sensitive substring match). Each occurrence in the input adds
    exactly one to that text's count, however many keywords it matches; this
    matters because e.g. '人工智能' also contains '智能'.

    :param danmakus: iterable of danmaku strings
    :return: dict mapping each AI-related danmaku text to its number of
        occurrences, in first-seen order
    """
    ai_keywords = ('AI', '人工智能', '机器学习', '深度学习', '算法', '大数据', '智能', '技术')
    keyword_counts = {}
    for danmaku in danmakus:
        # any() stops at the first hit, so one occurrence is counted once.
        if any(keyword in danmaku for keyword in ai_keywords):
            keyword_counts[danmaku] = keyword_counts.get(danmaku, 0) + 1
    return keyword_counts