main
starhun 2 years ago
commit 0887703d59

ai.py

@@ -0,0 +1,44 @@
import pandas as pd
from collections import Counter
import re

# Filter out danmaku (bullet comments) that mention AI-related terms
def filter_ai_related_danmakus(danmakus):
    ai_keywords = [
        'AI', '人工智能', '机器学习', '深度学习', '算法', '数据分析',
        '自动化', '智能识别', '智能', '机器人',
        '视觉识别', '语音识别', '图像处理', '大数据', '自适应', '智能系统',
        '智能决策', '神经网络', '自动驾驶', '虚拟现实', '增强现实', '模式识别',
        '自然语言处理', '推荐系统', '智能助手', '深度神经网络', '智能合约',
        '量化分析', '特征工程', '云计算', '边缘计算', '人脸识别', '手势识别'
    ]
    filtered_danmakus = []
    for danmaku in danmakus:
        if any(re.search(keyword, danmaku, re.IGNORECASE) for keyword in ai_keywords):
            filtered_danmakus.append(danmaku)
    return filtered_danmakus

# Count how often each danmaku appears
def count_danmakus(danmakus):
    danmaku_counter = Counter(danmakus)
    return danmaku_counter.most_common(8)  # keep the top 8 entries

# Main entry point
def main():
    # Read the previously saved danmaku data
    df = pd.read_csv('danmaku.csv')
    # Drop empty rows and force strings so re.search does not fail on NaN values
    danmakus = df['danmaku'].dropna().astype(str).tolist()
    # Keep only the AI-related danmaku
    ai_related_danmakus = filter_ai_related_danmakus(danmakus)
    # Count them and take the 8 most frequent
    top_8_danmakus = count_danmakus(ai_related_danmakus)
    # Print the results
    print("Top 8 danmaku related to the use of AI at this event, with their counts:")
    for danmaku, count in top_8_danmakus:
        print(f"{danmaku}: {count}")

if __name__ == "__main__":
    main()
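The analysis above assumes a danmaku.csv with a single 'danmaku' column, which the crawler further down in this commit produces. A minimal sketch for smoke-testing the filter and counting functions without running the crawler; the sample strings below are hypothetical placeholders, not real data:

import pandas as pd

# Write a tiny stand-in for danmaku.csv so ai.py's main() has something to read.
# These rows are invented for this sketch only.
sample = ['AI判罚真准', '中国队加油', 'AI判罚真准', '这个算法厉害', '精彩']
pd.DataFrame({'danmaku': sample}).to_csv('danmaku.csv', index=False, encoding='utf-8-sig')

Running ai.py afterwards should then report the AI-related entries only: 'AI判罚真准' twice and '这个算法厉害' once.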

@@ -0,0 +1,19 @@
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the CSV file containing the AI-related danmaku
file_path = 'danmaku.csv'
df = pd.read_csv(file_path)

# Join all danmaku into one long string for word-cloud generation
text = ' '.join(df['danmaku'].dropna().astype(str))

# Generate the word cloud (a CJK-capable font such as SimHei is needed to render Chinese text)
wordcloud = WordCloud(width=800, height=400, background_color='white',
                      font_path='/usr/share/fonts/truetype/simhei.ttf').generate(text)

# Plot the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
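The script assumes a SimHei font at /usr/share/fonts/truetype/simhei.ttf; on other systems font_path will likely need to point at whatever CJK-capable font is installed. When the script runs headless (no display), the cloud can be written to disk instead of shown; a small sketch, with the output file name chosen here for illustration:

# Save the rendered word cloud as an image file ('wordcloud.png' is this sketch's own choice).
wordcloud.to_file('wordcloud.png')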

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,104 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import pandas as pd
import time
import re
from bs4 import BeautifulSoup

# Search Bilibili videos by keyword
def search_bilibili_videos(keyword, pages=1):
    videos = []
    search_url = "https://api.bilibili.com/x/web-interface/search/all/v2"
    for page in range(1, pages + 1):
        params = {
            'keyword': keyword,
            'page': page,
            'order': 'totalrank'  # sort by overall relevance (original had the typo 'toalrank')
        }
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0',
            # "Referer": "https://www.bilibili.com",
            'cookie': 'buvid3=71DE7BF1-65B2-E9EF-A649-24AAA66F6C7678761infoc; b_nut=1725457378; _uuid=9D6B1761-AF8E-C361-377A-C98C108473410589523infoc; enable_web_push=DISABLE; home_feed_column=5; browser_resolution=1536-256; buvid_fp=4bcf0d8c44fc914c4c5fd2ed0dae4f4b; buvid4=5DF4D6D0-2AE4-BF08-68E4-9D5F0577449779973-024090413-QskMrVA3JCskGyxu8QjrHA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kkYu|R~0J\'u~klRl~J)R; SESSDATA=861f18ef%2C1741438809%2Cea6ab%2A91CjBoRWoAds6HeZgtds_Bgccg6ZxZDoepB94Evl7kVfLMtwSKMU2X2lFH5ZfOQAqlsCESVmlINjNpV050NnFucWlibklobHhOYlJlLW9laEtyRnd5aElhVzZMM1NXdnhxT1lzMHB5WUNvcnpOcjVuUDZhMWhhUmhQdU5rQzJaQlZ0RXN5dXMtb3RnIIEC; bili_jct=0bf03509e683bc052db716bbaf2463f1; DedeUserID=1517012437; DedeUserID__ckMd5=20fba854b81fb2cd; b_lsid=73C810398_191D6DEBF89; bsource=search_baidu; header_theme_version=CLOSE; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxNDYwMDgsImlhdCI6MTcyNTg4Njc0OCwicGx0IjotMX0.JZKSF_Wk7iJkslm76AUNtIij5YF8FPsezF14FG1KTbk; bili_ticket_expires=1726145948; sid=e6ubibh7'
        }
        response = requests.get(search_url, params=params, headers=headers)
        if response.status_code == 200:
            data = response.json()
            for result in data['data']['result']:
                if result['result_type'] == 'video':
                    for video in result['data']:
                        videos.append({
                            'title': video['title'],
                            'aid': video['aid'],
                            'bvid': video['bvid'],
                            'danmaku': video['danmaku'],
                            'url': video['arcurl']
                        })
        time.sleep(1)  # throttle requests to avoid being blocked
    return videos

# Get the cid (player page id) of a video from its aid
def get_cid(aid):
    video_url = f"https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0'}
    response = requests.get(video_url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['cid']
    return None

# Fetch the danmaku (bullet comments) for a given cid
def get_danmaku(cid):
    danmaku_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0'}
    response = requests.get(danmaku_url, headers=headers)
    response.encoding = 'utf-8'
    danmakus = []
    obj = re.compile(r'<d p="(.*?)">(?P<danmaku>.*?)</d>', re.S)
    if response.status_code == 200:
        ds = obj.finditer(response.text)
        for d in ds:
            danmakus.append(d.group("danmaku"))
        # soup = BeautifulSoup(response.text, 'lxml-xml')
        # danmakus = [d.text for d in soup.find_all('d')]
        return danmakus
    return []

# Fetch the danmaku for a single video (run inside the thread pool)
def process_video(video):
    cid = get_cid(video['aid'])
    if cid:
        danmakus = get_danmaku(cid)
        print(f"Fetched {len(danmakus)} danmaku for video {video['title']}")
        return danmakus
    else:
        print(f"Could not get the cid for video {video['title']}")
        return []

# Main entry point
def main():
    keyword = "2024巴黎奥运会"
    pages = 10  # the search returns about 30 videos per page by default, so 10 pages is roughly 300 videos
    videos = search_bilibili_videos(keyword, pages=pages)
    all_danmakus = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_video, video) for video in videos]
        for future in as_completed(futures):
            danmakus = future.result()
            all_danmakus.extend(danmakus)
    # Dump the raw danmaku as plain text (the original wrote this to 'danmaku.xlsx',
    # but newline-separated text written this way is not a valid Excel file)
    with open('danmaku.txt', 'w', encoding='utf-8') as f:
        for danmaku in all_danmakus:
            f.write(danmaku + '\n')
    # Save as a CSV file
    df = pd.DataFrame({'danmaku': all_danmakus})
    df.to_csv('danmaku.csv', index=False, encoding='utf-8-sig')
    print(f"Collected {len(all_danmakus)} danmaku in total, saved to danmaku.csv")

if __name__ == "__main__":
    main()
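Because get_cid and get_danmaku each issue a single bare requests.get, a transient timeout or non-200 response silently drops that video's danmaku. A minimal retry sketch the two fetch functions could call instead of requests.get directly; the helper name and its parameters (fetch_with_retry, max_retries, backoff) are introduced here for illustration and are not part of the original script:

import time
import requests

def fetch_with_retry(url, params=None, headers=None, max_retries=3, backoff=2.0):
    # Retry a GET a few times with linear backoff; return the response on HTTP 200, else None.
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, params=params, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass  # treat network errors like a failed attempt and retry
        time.sleep(backoff * (attempt + 1))
    return None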