import re
import time
from collections import Counter

import requests
from bs4 import BeautifulSoup
import openpyxl
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
# Request headers: the Cookie must come from your own logged-in Bilibili
# session, and a real browser User-Agent keeps the API from rejecting requests.
header = {
    'Cookie': "buvid4=45060204-E173-B80C-04C7-11B8D43E872D75556-022101721-m4T90WVXeagV7lXz92ZWuw%3D%3D; DedeUserID=3461571130952367; DedeUserID__ckMd5=b3ea366f0ed87c94; CURRENT_FNVAL=4048; enable_web_push=DISABLE; _uuid=FFC241066-10489-E63A-8A4B-6C217510EACC571042infoc; buvid3=E0F932DB-25F3-EF2A-2629-B756E279809355623infoc; b_nut=1700799855; header_theme_version=CLOSE; rpdid=|(u))kkYu|JJ0J'u~|R~)~|Yu; PVID=1; FEED_LIVE_VERSION=V8; buvid_fp_plain=undefined; CURRENT_QUALITY=0; fingerprint=450dce408658a7f873061345351176c7; buvid_fp=77344ae6cddf7b1d32d0c69bd9a38f80; b_lsid=4D7FCA4C_191DC589C5D; SESSDATA=8840b176%2C1741530686%2C6d032%2A92CjC46834ghJkHd4YjzNOtLiSAgKyx6hzATr7eWbatDJxB8pR708adiIvgr2OqCWEFkASVmluRUljMDZPQWlEMUx2MlVhQkJoanRvR1kxNlZ2M2hjS0hPekFrWGlNZFc2ZWNiWmdrQ0tyaWlkODFyNUp2WDdqc3FSbjUzUEhjNlFBc0U0UGR4a3d3IIEC; bili_jct=80e99b3d5026f86143f072c7e055fed8; sid=gbw623md; bp_t_offset_3461571130952367=975560108564021248; home_feed_column=4; browser_resolution=895-822",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}
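# A minimal sketch for keeping the session cookie out of the source file; the
# environment variable name BILI_COOKIE is an assumption, not part of the
# original script:
import os

if os.environ.get('BILI_COOKIE'):
    header['Cookie'] = os.environ['BILI_COOKIE']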
# Collect a deduplicated list of video aids for a search keyword:
def get_video_aid_list(keyword):
    aid_set = set()
    aid_list = []
    page = 1
    while len(aid_set) < 300:
        url = f"https://api.bilibili.com/x/web-interface/search/all/v2?keyword={keyword}&page={page}"
        response = requests.get(url, headers=header)
        data = response.json()
        results = (data.get('data') or {}).get('result') or []
        if not results:
            break  # no more search pages; stop instead of looping forever
        for item in results:
            if item['result_type'] == 'video':
                for video in item['data']:
                    if video['aid'] not in aid_set:
                        aid_list.append({
                            'title': video['title'],
                            'aid': video['aid']
                        })
                        aid_set.add(video['aid'])
        page += 1
        time.sleep(0.15)  # throttle requests a little
    return aid_list
def get_cid_by_aid(aid):
    # The view API resolves an aid to the cid of the video's first part.
    url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
    response = requests.get(url, headers=header)
    data = response.json()
    return data['data']['cid']
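# Multi-part uploads carry one cid per part in data['data']['pages'] of the
# same view response. A sketch for collecting all of them; the helper name
# get_all_cids_by_aid is new, not part of the original script:
def get_all_cids_by_aid(aid):
    url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
    data = requests.get(url, headers=header).json()
    return [page['cid'] for page in data['data']['pages']]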
def get_danmaku(cid):
    # The legacy XML endpoint lists every danmaku as a <d> element.
    url = f"https://comment.bilibili.com/{cid}.xml"
    response = requests.get(url, headers=header)
    soup = BeautifulSoup(response.content, "xml")
    danmakus = soup.find_all('d')
    return [d.text for d in danmakus]
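# Each <d> element also carries metadata in its "p" attribute, a comma-separated
# list whose first field is the playback offset in seconds. A sketch that keeps
# that timestamp alongside the text (a variant, not used by the script below):
def get_danmaku_with_time(cid):
    url = f"https://comment.bilibili.com/{cid}.xml"
    soup = BeautifulSoup(requests.get(url, headers=header).content, "xml")
    return [(float(d['p'].split(',')[0]), d.text) for d in soup.find_all('d')]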
def crawl_danmaku_data(keyword):
    videos = get_video_aid_list(keyword)
    all_danmakus = []
    for video in videos:
        aid = video['aid']
        title = video['title']
        try:
            cid = get_cid_by_aid(aid)
            print(f"Crawling danmaku: {title} (AID: {aid}, CID: {cid})")
            danmakus = get_danmaku(cid)
            all_danmakus.extend(danmakus)
        except Exception as e:
            print(f"Error while fetching danmaku for {title}: {e}")
    # Save the raw danmaku before any filtering:
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = "Danmakus"
    sheet.append(["Danmaku text"])
    for item in all_danmakus:
        sheet.append([item])
    wb.save('danmakus.xlsx')
    return all_danmakus
# Keywords that mark a danmaku as AI-related (AI, artificial intelligence,
# cloud technology, machine learning, deep learning, neural network, big data):
ai_keywords = ["AI", "人工智能", "云技术", "机器学习", "深度学习", "神经网络", "大数据"]
def contains_pattern(text):
    # Lookarounds keep "AI" from matching inside English words such as "mail";
    # re.IGNORECASE replaces the original text.lower(), which could never have
    # matched the uppercase keyword "AI".
    pattern = r'(?<![a-zA-Z])(?:' + '|'.join(re.escape(keyword) for keyword in ai_keywords) + r')(?![a-zA-Z])'
    return bool(re.search(pattern, text, flags=re.IGNORECASE))
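# Quick sanity checks of the boundary behavior (illustrative inputs only):
assert contains_pattern("这个AI好厉害")          # standalone "AI" matches
assert not contains_pattern("check your email")  # "ai" inside a word does not
assert contains_pattern("人工智能无处不在")       # Chinese keywords match directly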
def ai_related_danmakus(danmakus):
    # Keep only the danmaku that mention one of the AI keywords.
    related = [danmaku for danmaku in danmakus if contains_pattern(danmaku)]
    print(related)
    return related
# Write the (danmaku, count) pairs to Excel:
def write_to_excel(data, file_name='ai_related_danmakus.xlsx'):
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = "AI Related Danmakus"
    sheet.append(["Danmaku text", "Count"])
    for item in data:
        sheet.append([item[0], item[1]])
    wb.save(file_name)
# Generate a word cloud:
def generate_wordcloud(data):
    # jieba segments the Chinese text so WordCloud can count real words (the
    # original imported jieba but never used it, leaving CJK runs unsplit);
    # msyh.ttc (Microsoft YaHei) must be available for Chinese glyphs to render.
    wordcloud_text = ' '.join(jieba.lcut(' '.join(data)))
    wordcloud = WordCloud(font_path='msyh.ttc', width=800, height=400,
                          background_color='white').generate(wordcloud_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
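# The cloud can also be written straight to disk via WordCloud.to_file(); the
# helper and filename below are only a sketch, not part of the original script:
def save_wordcloud(wordcloud, path='ai_danmaku_wordcloud.png'):
    wordcloud.to_file(path)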
# Main entry point:
if __name__ == "__main__":
    keyword = "2024巴黎奥运会"  # search keyword: "2024 Paris Olympics"
    danmakus = crawl_danmaku_data(keyword)
    print(f"Fetched {len(danmakus)} danmaku in total.")
    ai_danmakus = ai_related_danmakus(danmakus)
    top_8_danmakus = Counter(ai_danmakus).most_common(8)
    print("Top 8 AI-related danmaku:")
    print(top_8_danmakus)
    write_to_excel(top_8_danmakus)
    generate_wordcloud(ai_danmakus)
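# Dependency note: BeautifulSoup's "xml" parser requires lxml. A plausible
# install line (versions unpinned, as in the original):
#   pip install requests beautifulsoup4 lxml openpyxl matplotlib wordcloud jieba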