You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

161 lines
5.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import time
from bs4 import BeautifulSoup
from collections import Counter
import openpyxl
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
import copy
import re
# HTTP headers attached to every bilibili request.
# WARNING(review): the Cookie below embeds a real logged-in session
# (SESSDATA and bili_jct are authentication credentials). Hard-coding
# secrets in source is unsafe — rotate this cookie and load it from an
# environment variable or local config file instead.
header = {
    'Cookie': "buvid4=45060204-E173-B80C-04C7-11B8D43E872D75556-022101721-m4T90WVXeagV7lXz92ZWuw%3D%3D; DedeUserID=3461571130952367; DedeUserID__ckMd5=b3ea366f0ed87c94; CURRENT_FNVAL=4048; enable_web_push=DISABLE; _uuid=FFC241066-10489-E63A-8A4B-6C217510EACC571042infoc; buvid3=E0F932DB-25F3-EF2A-2629-B756E279809355623infoc; b_nut=1700799855; header_theme_version=CLOSE; rpdid=|(u))kkYu|JJ0J'u~|R~)~|Yu; PVID=1; FEED_LIVE_VERSION=V8; buvid_fp_plain=undefined; CURRENT_QUALITY=0; fingerprint=450dce408658a7f873061345351176c7; buvid_fp=77344ae6cddf7b1d32d0c69bd9a38f80; b_lsid=4D7FCA4C_191DC589C5D; SESSDATA=8840b176%2C1741530686%2C6d032%2A92CjC46834ghJkHd4YjzNOtLiSAgKyx6hzATr7eWbatDJxB8pR708adiIvgr2OqCWEFkASVmluRUljMDZPQWlEMUx2MlVhQkJoanRvR1kxNlZ2M2hjS0hPekFrWGlNZFc2ZWNiWmdrQ0tyaWlkODFyNUp2WDdqc3FSbjUzUEhjNlFBc0U0UGR4a3d3IIEC; bili_jct=80e99b3d5026f86143f072c7e055fed8; sid=gbw623md; bp_t_offset_3461571130952367=975560108564021248; home_feed_column=4; browser_resolution=895-822",
    # NOTE(review): this user-agent is a junk placeholder string, not a real
    # browser UA; a realistic one (like the commented example) is less likely
    # to be rejected by the API.
    'user-agent': '12345678zxdcvfbnmdfcsdxaghdnhguiasgck'
    # "Mozilla/5.0 (111Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0
    # Safari/537.36 Edg/128.0.0.0"
}
# Fetch the list of video aids matching a search keyword.
def get_video_aid_list(keyword, max_videos=300, max_pages=50):
    """Search bilibili for *keyword* and collect unique video aids.

    Pages through the search API until *max_videos* unique aids are
    collected, *max_pages* pages have been fetched, or a page yields no
    new videos (end of results).

    Args:
        keyword: search term passed to the bilibili search API.
        max_videos: stop once at least this many unique aids are found
            (a full page is always consumed, so the total may slightly
            exceed this, matching the original behavior).
        max_pages: hard cap on pages fetched — the original looped
            forever if fewer than 300 results existed.

    Returns:
        List of dicts with keys 'title' and 'aid', in discovery order.
    """
    aid_set = set()
    aid_list = []
    page = 1
    while len(aid_set) < max_videos and page <= max_pages:
        url = f"https://api.bilibili.com/x/web-interface/search/all/v2?keyword={keyword}&page={page}"
        response = requests.get(url, headers=header)
        data = response.json()
        # Tolerate missing 'data'/'result' keys on error responses.
        results = (data.get('data') or {}).get('result') or []
        found_new = False
        for item in results:
            if item['result_type'] == 'video':
                for video in item['data']:
                    if video['aid'] not in aid_set:
                        aid_list.append({
                            'title': video['title'],
                            'aid': video['aid']
                        })
                        aid_set.add(video['aid'])
                        found_new = True
        if not found_new:
            # No new videos on this page: results exhausted, stop paging.
            break
        page += 1
        time.sleep(0.15)  # throttle requests to avoid rate limiting
    return aid_list
# Resolve a video's cid from its aid.
def get_cid_by_aid(aid):
    """Return the cid (content id) for the video identified by *aid*.

    Queries the bilibili "view" API and reads the cid out of the JSON
    payload; raises KeyError if the response lacks the expected fields.
    """
    view_url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
    payload = requests.get(view_url, headers=header).json()
    return payload['data']['cid']
# Download a video's danmaku (bullet comments).
def get_danmaku(cid):
    """Fetch the danmaku XML for *cid* and return the comment texts.

    Each <d> element in the XML feed holds one danmaku string.
    """
    resp = requests.get(f"https://comment.bilibili.com/{cid}.xml", headers=header)
    xml = BeautifulSoup(resp.content, "xml")
    texts = []
    for node in xml.find_all('d'):
        texts.append(node.text)
    return texts
def crawl_danmaku_data(keyword):
    """Crawl danmakus for every video matching *keyword*.

    Searches for videos, downloads each one's danmakus, saves the raw
    danmakus to 'danmakus.xlsx', and returns the combined list.

    Args:
        keyword: search term forwarded to get_video_aid_list.

    Returns:
        List of all danmaku strings collected across the videos.
    """
    videos = get_video_aid_list(keyword)
    all_danmakus = []
    for video in videos:
        aid = video['aid']
        title = video['title']
        try:
            # The cid lookup can also fail (deleted or restricted video);
            # the original called it outside the try, so one bad video
            # crashed the entire crawl.
            cid = get_cid_by_aid(aid)
            print(f"正在爬取弹幕: {title} (AID: {aid}, CID: {cid})")
            danmakus = get_danmaku(cid)
            all_danmakus.extend(danmakus)
        except Exception as e:
            # Best-effort crawl: log the failure and move on.
            print(f"获取视频 {title} 的弹幕时出错: {e}")
    # Persist raw danmakus so a failure in later steps doesn't lose the crawl.
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = "Danmakus"
    sheet.append(["弹幕内容"])
    for item in all_danmakus:
        sheet.append([item])
    wb.save('danmakus.xlsx')
    return all_danmakus
# Keywords (English + Chinese) that mark a danmaku as AI-related.
ai_keywords = ["ai", "人工智能", "智能", "机器学习", "深度学习", "神经网络", ]

# Compiled once at import time — the original rebuilt this pattern on every
# call. The lookarounds keep "ai" from matching inside longer ASCII words
# such as "air" or "maid"; they have no effect next to CJK characters.
_ai_pattern = re.compile(
    r'(?<![a-zA-Z])(?:'
    + '|'.join(re.escape(keyword) for keyword in ai_keywords)
    + r')(?![a-zA-Z])'
)


def contains_pattern(text):
    """Return True if *text* mentions any AI keyword (case-insensitive)."""
    return _ai_pattern.search(text.lower()) is not None
def ai_related_danmakus(danmakus):
    """Filter *danmakus* down to the AI-related ones.

    Uses contains_pattern (case-insensitive keyword match), prints the
    selection for inspection, and returns it as a list.
    """
    selected = [d for d in danmakus if contains_pattern(d)]
    print(selected)
    return selected
# Write the ranked statistics to an Excel workbook.
def write_to_excel(data, file_name='ai_related_danmakus.xlsx'):
    """Save (danmaku, count) pairs to an Excel file.

    Args:
        data: iterable of 2-item sequences — danmaku text and its count.
        file_name: output path for the workbook.
    """
    workbook = openpyxl.Workbook()
    ws = workbook.active
    ws.title = "AI Related Danmakus"
    ws.append(["弹幕内容", "出现次数"])
    for text, count in data:
        ws.append([text, count])
    workbook.save(file_name)
# Render a word cloud from the danmaku strings.
def generate_wordcloud(data):
    """Build and display a word cloud of the given danmaku strings.

    Joins the strings with spaces, renders them with a CJK-capable font
    (msyh.ttc must be on the path), and shows the figure via matplotlib.
    """
    text_blob = ' '.join(data)
    cloud = WordCloud(
        font_path='msyh.ttc',
        width=800,
        height=400,
        background_color='white',
    ).generate(text_blob)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
# Script entry point: crawl, filter, rank, export, and visualize.
if __name__ == "__main__":
    search_term = "2024巴黎奥运会"
    all_danmakus = crawl_danmaku_data(search_term)
    print(f"共获取到 {len(all_danmakus)} 条弹幕数据。")
    ai_danmakus = ai_related_danmakus(all_danmakus)
    # Rank the AI-related danmakus by frequency and keep the top 8.
    top_8 = Counter(ai_danmakus).most_common(8)
    print("AI相关弹幕排名前8的数据")
    print(top_8)
    write_to_excel(top_8)
    generate_wordcloud(ai_danmakus)