You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

120 lines
5.7 KiB

import requests
import re
import time
from collections import Counter
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Search keyword (2024 Paris Olympics). The search URLs built further down
# embed its percent-encoded form directly; this variable is not referenced by
# any code visible in this file.
query = "2024巴黎奥运会"
# HTTP headers sent with every request made by this script.
# NOTE(review): the Cookie value below appears to be a logged-in browser
# session (it carries SESSDATA and bili_jct fields) hard-coded in source.
# Consider loading it from the environment or a git-ignored config file and
# invalidating this session if the file has been published.
headers = {
"Cookie": "buvid3=F85083C9-B0B0-58EF-387E-9810D717FBD394717infoc; b_nut=1695630694; i-wanna-go-back=-1; b_ut=7; _uuid=4691069C1-57109-F951-5C2C-71061B15CAB9C93820infoc; buvid4=80C1A4DB-57B6-89F1-B7AB-7AE606C3BFB795506-023092516-b1nz50QSFWAVh9QAs1wBqg%3D%3D; DedeUserID=391260816; DedeUserID__ckMd5=874384c11cc311ca; hit-dyn-v2=1; rpdid=|(JlRYJ~Yk||0J'uYmlYJ|~mu; buvid_fp_plain=undefined; LIVE_BUVID=AUTO7816956505396915; is-2022-channel=1; enable_web_push=DISABLE; header_theme_version=CLOSE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; CURRENT_BLACKGAP=0; bp_video_offset_391260816=964407697698979840; CURRENT_FNVAL=4048; CURRENT_QUALITY=116; fingerprint=0caf6ff40a6d821a9253179cd16721cc; buvid_fp=daecdb2a27b0352be0af14099f69b721; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU1MDk5MzUsImlhdCI6MTcyNTI1MDY3NSwicGx0IjotMX0.uE2PcZgAdDTBtqyfu7qsT_GKqNMsmsvjtdKYmeQ0eno; bili_ticket_expires=1725509875; SESSDATA=d4e31c61%2C1740843740%2Cc4b21%2A91CjBgFJe4MbiVSvKl_Z-oJcHfxPNmwxIX4iMw7S41V1DMuuAhaahCmSK6_p-xsyPHvC8SVi13bXN4RE40V2NCeGYwNWhYclNJckNfaGx4SzZydk05aE56ajdkS2dzZUVRWG9YeE5jbXFVdXF1aTZWTmxQZnRjZXZYaHJLU1dleElsRVczZG4wQW9RIIEC; bili_jct=f25b09f990746c712d4ef672d19e2628; PVID=1; sid=84brlx1u; home_feed_column=5; browser_resolution=2048-1018; bp_t_offset_391260816=973171033005621248; b_lsid=54110E26A_191BB365E57",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
# Number of search-result pages GetCid() iterates over (pages 1..total_page).
total_page = 10
# Captures the digits of the first `"cid":<digits>` occurrence in a video page.
cid_pattern = re.compile(r'"cid":(\d+)')
# cids collected by GetCid(), in discovery order; consumed by Getdanmu().
total_cid_list = []
# Danmaku text -> number of occurrences across all fetched videos; filled by
# Getdanmu() and read by Sortdanmu().
total_comment_dict = {}
# Captures the value of every `bvid:"..."` occurrence in a search-result page.
bvid_pattern = re.compile(r'bvid:"(.*?)"')
def GetFirstBidUrl():
    """Return the fixed search-results URL that GetCid() requests on its first iteration.

    NOTE(review): the query string hard-codes ``page=2&o=36``, the same paging
    parameters GetCid() builds for its second iteration -- confirm whether the
    first results page was intended here.
    """
    first_search_url = (
        "https://search.bilibili.com/all?vt=82099157"
        "&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A"
        "&from_source=webtop_search&spm_id_from=333.1007"
        "&search_source=5&page=2&o=36"
    )
    return first_search_url
def GetCid(max_videos=300, request_timeout=10):
    """Collect video cids from the search results into ``total_cid_list``.

    For each search page from 1 to ``total_page`` the page HTML is fetched,
    every bvid matched by ``bvid_pattern`` is extracted, the corresponding
    video page is fetched, and the first cid matched by ``cid_pattern`` is
    appended to the module-level ``total_cid_list``. Progress and timing are
    printed, and the function sleeps one second after each search page.

    Args:
        max_videos: Stop once ``total_cid_list`` holds this many cids.
        request_timeout: Seconds to wait on each HTTP request before giving up,
            so a stalled connection cannot hang the crawl.

    Returns:
        None. Results are accumulated in ``total_cid_list``.
    """
    # The page-1 URL from GetFirstBidUrl() and the page-2 URL built below both
    # request page=2, so the same bvid can be seen on two iterations; remember
    # what was already visited so each video is fetched and counted once.
    seen_bvids = set()
    for page in range(1, total_page + 1):
        if len(total_cid_list) >= max_videos:
            break
        print(f"Processing page {page}...\n")
        start = time.time()
        if page == 1:
            search_url = GetFirstBidUrl()
        else:
            search_url = f"https://search.bilibili.com/all?vt=82451961&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}&o=36"
        try:
            response = requests.get(search_url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # One bad search page should not abort the whole crawl.
            print(f"Failed to fetch search page {page}: {exc}")
            continue
        current_bvid_list = bvid_pattern.findall(response.text)
        end = time.time()
        print(f"获取bid用时{end - start}s\n")
        start = time.time()
        # Resolve each bvid on this page to its cid via the video page.
        for bvid in current_bvid_list:
            if bvid in seen_bvids:
                continue
            seen_bvids.add(bvid)
            video_url = f"https://www.bilibili.com/video/{bvid}"
            try:
                response = requests.get(video_url, headers=headers, timeout=request_timeout)
            except requests.RequestException as exc:
                print(f"Failed to fetch video page {bvid}: {exc}")
                continue
            cid_match = cid_pattern.search(response.text)
            if cid_match is None:
                # Page did not contain a cid (removed video, error page, ...).
                print(f"No cid found for {bvid}, skipping")
                continue
            current_cid = cid_match.group(1)
            print(f"获取到第{len(total_cid_list) + 1}个cid:{current_cid}")
            total_cid_list.append(current_cid)
            if len(total_cid_list) >= max_videos:
                break
        end = time.time()
        print(f"获取cid用时:{end - start}s\n")
        # Pause between search pages to avoid hammering the server.
        time.sleep(1)
def Getdanmu(request_timeout=10):
    """Fetch the danmaku of every collected video and tally them.

    For each cid in the module-level ``total_cid_list`` the danmaku XML is
    requested, the text of every ``<d p="...">`` element is extracted, and the
    per-text occurrence counts are added into the module-level
    ``total_comment_dict``. Sleeps 0.5 seconds after each video.

    Args:
        request_timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Results are accumulated in ``total_comment_dict``.
    """
    for position, cid in enumerate(total_cid_list, start=1):
        print(f"正在获取第{position}个视频的弹幕")
        danmu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            response = requests.get(danmu_url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # Skip this video but keep going; one failure should not lose
            # the danmaku already gathered from the others.
            print(f"Failed to fetch danmaku for cid {cid}: {exc}")
        else:
            # Force UTF-8 decoding of the XML body before reading .text.
            response.encoding = 'utf-8'
            current_counts = Counter(re.findall('<d p=".*?">(.*?)</d>', response.text))
            # Merge this video's counts into the running totals.
            for text, count in current_counts.items():
                total_comment_dict[text] = total_comment_dict.get(text, 0) + count
        time.sleep(0.5)
# Keep only the collected danmaku that are related to AI.
# "ai" (any case) directly followed by, or directly preceded by, a CJK
# ideograph; this avoids matching "ai" inside ordinary Latin words.
_AI_THEN_CJK = re.compile(r'ai[\u4e00-\u9fff]', re.IGNORECASE)
_CJK_THEN_AI = re.compile(r'[\u4e00-\u9fff]ai', re.IGNORECASE)


def _is_ai_comment(text):
    """Return True when a danmaku text counts as AI-related."""
    if '人工智能' in text:
        return True
    # Comments mentioning "Aiden" (in any letter case) are excluded from the
    # pattern matches, because "Aiden" next to a CJK character would
    # otherwise match.
    if 'aiden' in text.lower():
        return False
    return bool(_AI_THEN_CJK.search(text) or _CJK_THEN_AI.search(text))


def Sortdanmu():
    """Filter AI-related danmaku, sort them by frequency, and export them.

    Reads the module-level ``total_comment_dict``, keeps the entries accepted
    by ``_is_ai_comment``, and stores them sorted by descending count in the
    module-level ``sorted_comment_dict``. The result is printed and written to
    ``comments.xlsx`` with the columns ``Comment`` and ``Count``.

    Returns:
        None.
    """
    global sorted_comment_dict
    ai_comment = {
        text: count
        for text, count in total_comment_dict.items()
        if _is_ai_comment(text)
    }
    sorted_comment_dict = dict(sorted(ai_comment.items(), key=lambda x: x[1], reverse=True))
    print(sorted_comment_dict)
    df = pd.DataFrame(list(sorted_comment_dict.items()), columns=['Comment', 'Count'])
    df.to_excel('comments.xlsx', index=False)
def CreatWordCloud():
    """Build and display a word cloud from the AI-related danmaku.

    Reads the module-level ``sorted_comment_dict`` (created by Sortdanmu(),
    which therefore has to run first), repeats each comment as many times as
    it occurred so frequent comments weigh more, and renders the result.

    Returns:
        The generated ``WordCloud`` object, or ``None`` when there are no
        comments to draw.
    """
    comment_text = ' '.join((k + ' ') * v for k, v in sorted_comment_dict.items())
    if not comment_text.strip():
        # WordCloud.generate() raises ValueError when given no words at all.
        print("No AI-related danmaku found; skipping word cloud.")
        return None
    wordcloud = WordCloud(
        # A font with CJK glyphs is needed to render Chinese text.
        # NOTE(review): this path only exists on Windows.
        font_path='C:/Windows/Fonts/simsun.ttc',
        width=800, height=400,
        background_color='white',
        max_words=200,
        colormap='viridis'
    ).generate(comment_text)
    # Show the image; previously the generated cloud was discarded unseen.
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    return wordcloud
def main():
    """Run the whole pipeline in order.

    Collect cids, fetch their danmaku, filter and export the AI-related ones,
    then build the word cloud.
    """
    pipeline = (GetCid, Getdanmu, Sortdanmu, CreatWordCloud)
    for stage in pipeline:
        stage()


if __name__ == "__main__":
    main()