import copy
import os
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import openpyxl
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud


# HTTP headers attached to every request this script sends.
#
# NOTE(review): the fallback Cookie below is a hard-coded browser cookie that
# appears to carry account session values (e.g. SESSDATA, bili_jct). It should
# be rotated and kept out of version control. Set the BILIBILI_COOKIE
# environment variable to supply a cookie without editing this file; the
# hard-coded value is only used when that variable is unset or empty.
header = {
    'Cookie': os.environ.get('BILIBILI_COOKIE') or (
        "buvid4=45060204-E173-B80C-04C7-11B8D43E872D75556-022101721-m4T90WVXeagV7lXz92ZWuw%3D%3D; DedeUserID=3461571130952367; DedeUserID__ckMd5=b3ea366f0ed87c94; CURRENT_FNVAL=4048; enable_web_push=DISABLE; _uuid=FFC241066-10489-E63A-8A4B-6C217510EACC571042infoc; buvid3=E0F932DB-25F3-EF2A-2629-B756E279809355623infoc; b_nut=1700799855; header_theme_version=CLOSE; rpdid=|(u))kkYu|JJ0J'u~|R~)~|Yu; PVID=1; FEED_LIVE_VERSION=V8; buvid_fp_plain=undefined; CURRENT_QUALITY=0; fingerprint=450dce408658a7f873061345351176c7; buvid_fp=77344ae6cddf7b1d32d0c69bd9a38f80; b_lsid=4D7FCA4C_191DC589C5D; SESSDATA=8840b176%2C1741530686%2C6d032%2A92CjC46834ghJkHd4YjzNOtLiSAgKyx6hzATr7eWbatDJxB8pR708adiIvgr2OqCWEFkASVmluRUljMDZPQWlEMUx2MlVhQkJoanRvR1kxNlZ2M2hjS0hPekFrWGlNZFc2ZWNiWmdrQ0tyaWlkODFyNUp2WDdqc3FSbjUzUEhjNlFBc0U0UGR4a3d3IIEC; bili_jct=80e99b3d5026f86143f072c7e055fed8; sid=gbw623md; bp_t_offset_3461571130952367=975560108564021248; home_feed_column=4; browser_resolution=895-822"
    ),
    'user-agent': '12345678zxdcvfbnmdfcsdxaghdnhguiasgck',
    # A browser-like user agent that the original author left commented out:
    # "Mozilla/5.0 (111Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0
    # Safari/537.36 Edg/128.0.0.0"
}


# Collect the aid list of videos matching a search keyword.
def get_video_aid_list(keyword, max_videos=300):
    """Search Bilibili for *keyword* and return the distinct videos found.

    Result pages are requested one after another until at least
    ``max_videos`` distinct aids have been collected, or until a page
    contributes no new video (results exhausted or an error payload), so the
    loop always terminates.

    Args:
        keyword: Search phrase. It is passed via ``params`` so that
            non-ASCII text and characters such as ``&`` or ``#`` are
            URL-encoded correctly.
        max_videos: Stop paging once this many distinct videos are known.
            Whole pages are consumed, so the result may hold slightly more.

    Returns:
        A list of ``{'title': ..., 'aid': ...}`` dicts in discovery order,
        without duplicate aids.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    seen_aids = set()
    videos = []
    page = 1
    while len(seen_aids) < max_videos:
        response = requests.get(
            "https://api.bilibili.com/x/web-interface/search/all/v2",
            params={"keyword": keyword, "page": page},
            headers=header,
            timeout=10,
        )
        response.raise_for_status()
        payload = response.json()

        # An error payload may lack 'data' or 'result'; treat it as "no results".
        result_groups = (payload.get('data') or {}).get('result') or []

        found_new = False
        for group in result_groups:
            if group.get('result_type') != 'video':
                continue
            for video in group.get('data') or []:
                aid = video['aid']
                if aid in seen_aids:
                    continue
                seen_aids.add(aid)
                videos.append({
                    'title': video['title'],
                    'aid': aid,
                })
                found_new = True

        # Without this guard the loop would spin forever once the search
        # stops yielding unseen videos before max_videos is reached.
        if not found_new:
            break

        page += 1
        time.sleep(0.15)  # small pause between pages to go easy on the API
    return videos


# Convert an aid into its cid.
def get_cid_by_aid(aid):
    """Look up the cid that belongs to the video identified by *aid*.

    Args:
        aid: Video aid as returned by ``get_video_aid_list``.

    Returns:
        The value of ``data.cid`` from the view API response.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        KeyError: If the response carries no ``data.cid``; the message
            includes the API's ``code`` and ``message`` fields when present.
    """
    response = requests.get(
        "https://api.bilibili.com/x/web-interface/view",
        params={"aid": aid},
        headers=header,
        timeout=10,
    )
    response.raise_for_status()
    payload = response.json()

    video_info = payload.get('data') or {}
    if 'cid' not in video_info:
        # Keep KeyError (what the bare subscript used to raise) but make it
        # say why the lookup failed.
        raise KeyError(
            f"no cid for aid {aid} "
            f"(code={payload.get('code')}, message={payload.get('message')})"
        )
    return video_info['cid']


# Fetch the danmaku (bullet comments) of one video.
def get_danmaku(cid):
    """Download the danmaku XML for *cid* and return the comment texts.

    Args:
        cid: Identifier used to build the ``comment.bilibili.com/{cid}.xml``
            URL.

    Returns:
        A list with the text of every ``<d>`` element in the document, in
        document order.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(
        f"https://comment.bilibili.com/{cid}.xml",
        headers=header,
        timeout=10,
    )
    response.raise_for_status()

    # Parse the raw bytes so the XML parser handles the declared encoding.
    soup = BeautifulSoup(response.content, "xml")
    return [node.text for node in soup.find_all('d')]


def crawl_danmaku_data(keyword, output_file='danmakus.xlsx'):
    """Crawl the danmaku of every video found for *keyword*.

    Each video is processed on a best-effort basis: a failure while
    resolving its cid or downloading its danmaku is reported and the crawl
    continues with the next video.

    Args:
        keyword: Search phrase handed to ``get_video_aid_list``.
        output_file: Path of the Excel workbook that receives all danmaku.

    Returns:
        A flat list of all danmaku texts that were collected.
    """
    videos = get_video_aid_list(keyword)
    all_danmakus = []

    for video in videos:
        aid = video['aid']
        title = video['title']
        try:
            # The cid lookup lives inside the try block so that a single
            # unavailable video cannot abort the whole crawl.
            cid = get_cid_by_aid(aid)
            print(f"正在爬取弹幕: {title} (AID: {aid}, CID: {cid})")
            all_danmakus.extend(get_danmaku(cid))
        except Exception as e:
            print(f"获取视频 {title} 的弹幕时出错: {e}")

    _save_danmakus(all_danmakus, output_file)
    return all_danmakus


def _save_danmakus(danmakus, file_name):
    """Write *danmakus* to a one-column Excel sheet named "Danmakus"."""
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = "Danmakus"
    sheet.append(["弹幕内容"])
    for text in danmakus:
        sheet.append([text])
    wb.save(file_name)


# Keywords that mark a danmaku as AI-related (matched against lower-cased text).
ai_keywords = ["ai", "人工智能", "智能", "机器学习", "深度学习", "神经网络", ]

# Compiled once at import time instead of being rebuilt on every call.
# The look-arounds reject a keyword that is glued to ASCII letters, so "ai"
# inside words such as "main" or "paid" does not count as a hit.
_AI_KEYWORD_PATTERN = re.compile(
    r'(?<![a-zA-Z])(?:'
    + '|'.join(re.escape(keyword) for keyword in ai_keywords)
    + r')(?![a-zA-Z])'
)


def contains_pattern(text):
    """Return True if *text* mentions any AI keyword, ignoring case.

    Args:
        text: A single danmaku string.

    Returns:
        True when a keyword from ``ai_keywords`` occurs in the lower-cased
        text without an ASCII letter directly before or after it, else False.
    """
    return _AI_KEYWORD_PATTERN.search(text.lower()) is not None


def ai_related_danmakus(danmakus):
    """Filter *danmakus* down to the entries that mention an AI keyword.

    Args:
        danmakus: Iterable of danmaku strings.

    Returns:
        A new list with every danmaku for which ``contains_pattern`` is
        true, in the original order (duplicates are kept so they can be
        counted later).
    """
    # The result list used to share the function's own name; a distinct name
    # avoids shadowing the function inside its body.
    matches = [danmaku for danmaku in danmakus if contains_pattern(danmaku)]
    print(matches)
    return matches


# Write the frequency statistics to an Excel workbook.
def write_to_excel(data, file_name='ai_related_danmakus.xlsx'):
    """Save (danmaku, count) pairs to a two-column Excel sheet.

    Args:
        data: Iterable of indexable items; element 0 is written as the
            danmaku text and element 1 as its occurrence count (for example
            the output of ``Counter.most_common``).
        file_name: Path of the workbook to create or overwrite.
    """
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = "AI Related Danmakus"
    sheet.append(["弹幕内容", "出现次数"])

    for item in data:
        sheet.append([item[0], item[1]])

    wb.save(file_name)


# Render a word cloud.
def generate_wordcloud(data, font_path='msyh.ttc'):
    """Display a word cloud built from the given danmaku strings.

    Args:
        data: Iterable of strings; they are joined with spaces and handed to
            ``WordCloud.generate``.
        font_path: Font file used for rendering. The default ``msyh.ttc``
            is kept from the original code; pass another font on systems
            where that file is unavailable.

    Returns:
        None. When *data* contains no text, a notice is printed and nothing
        is drawn, because ``WordCloud.generate`` raises on empty input.
    """
    # NOTE(review): jieba is imported at file level but not used here, so the
    # text is not word-segmented before rendering - confirm whether
    # segmentation was intended.
    wordcloud_text = ' '.join(data)
    if not wordcloud_text.strip():
        print("没有可用于生成词云的弹幕数据。")
        return

    wordcloud = WordCloud(
        font_path=font_path,
        width=800,
        height=400,
        background_color='white',
    ).generate(wordcloud_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


# Script entry point.
def main():
    """Crawl danmaku for a fixed keyword and report the AI-related ones.

    Steps: crawl all danmaku for the keyword, keep those that mention an AI
    keyword, print and save the eight most frequent ones to Excel, then show
    a word cloud of the AI-related danmaku.
    """
    keyword = "2024巴黎奥运会"
    danmakus = crawl_danmaku_data(keyword)
    print(f"共获取到 {len(danmakus)} 条弹幕数据。")

    ai_danmakus = ai_related_danmakus(danmakus)
    top_8_danmakus = Counter(ai_danmakus).most_common(8)
    print("AI相关弹幕排名前8的数据:")
    print(top_8_danmakus)

    write_to_excel(top_8_danmakus)
    generate_wordcloud(ai_danmakus)


if __name__ == "__main__":
    main()