Update python.py

main
ppfc5brxg 8 months ago
parent c1c5700cd5
commit 67bb98f033

@ -1,120 +1,106 @@
import os
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
from wordcloud import WordCloud
# Search keyword. The visible code never reads this name: the search URLs
# below embed their own percent-encoded keyword.
query = "巴黎奥运会"

# HTTP headers sent with every request.
# The bilibili session cookie (SESSDATA, bili_jct, ...) is a login credential
# and must not live in source control; supply it through the BILIBILI_COOKIE
# environment variable instead. Without it, requests are sent without a cookie.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
}
_cookie = os.environ.get("BILIBILI_COOKIE")
if _cookie:
    headers["Cookie"] = _cookie

# Number of search result pages to walk through.
total_page = 10
# Extracts the numeric cid from a video page's embedded JSON.
cid_pattern = re.compile(r'"cid":(\d+)')
# cids collected by GetCid(), in discovery order.
total_cid_list = []
# danmaku text -> total occurrence count, filled by Getdanmu().
total_comment_dict = {}
# Extracts bvids from the search result page's inline script data.
bvid_pattern = re.compile(r'bvid:"(.*?)"')
def GetFirstBidUrl():  # URL used for the first search request
    """Return the hard-coded search URL that GetCid() requests for page 1.

    NOTE(review): the URL carries ``page=2`` although it is used for the
    first page, so page 2 appears to be requested twice -- confirm whether
    ``page=1`` was intended.
    """
    return "https://search.bilibili.com/all?vt=82099157&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=2&o=36"
def GetCid(max_videos=300):  # collect the cids of up to 300 videos
    """Walk the search result pages and fill ``total_cid_list`` with cids.

    For each page the bvids are extracted with ``bvid_pattern``; every bvid's
    video page is then fetched and its cid extracted with ``cid_pattern``.
    Collection stops once ``max_videos`` cids have been gathered or
    ``total_page`` pages have been processed.

    Args:
        max_videos: Upper bound on the number of cids to collect.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    request_timeout = 10  # seconds; without it a stalled connection hangs forever

    for page in range(1, total_page + 1):
        if len(total_cid_list) >= max_videos:
            break

        print(f"Processing page {page}...\n")
        start = time.time()
        if page == 1:
            search_url = GetFirstBidUrl()
        else:
            search_url = f"https://search.bilibili.com/all?vt=82451961&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}&o=36"
        search_response = requests.get(search_url, headers=headers, timeout=request_timeout)
        current_bvid_list = bvid_pattern.findall(search_response.text)
        end = time.time()
        print(f"获取bid用时{end - start}s\n")

        start = time.time()
        # Resolve each bvid to its cid through the video page.
        for bvid in current_bvid_list:
            video_url = f"https://www.bilibili.com/video/{bvid}"
            video_response = requests.get(video_url, headers=headers, timeout=request_timeout)
            cid_match = cid_pattern.search(video_response.text)
            if cid_match is None:
                # Page without an embedded cid (removed video, error page, ...).
                print(f"No cid found for {bvid}, skipping")
                continue
            current_cid = cid_match.group(1)
            if current_cid in total_cid_list:
                # The same video can show up on several result pages; counting
                # its danmaku twice would inflate the statistics.
                continue
            print(f"获取到第{len(total_cid_list) + 1}个cid:{current_cid}")
            total_cid_list.append(current_cid)
            if len(total_cid_list) >= max_videos:
                break
        end = time.time()
        print(f"获取cid用时:{end - start}s\n")
        # Pause between result pages to avoid hammering the site.
        time.sleep(1)
def Getdanmu():  # fetch the danmaku of every collected cid
    """Download the danmaku of each cid and accumulate per-text counts.

    Iterates over ``total_cid_list``, requests the danmaku XML for each cid,
    extracts the text of every ``<d p="...">`` element and adds its number
    of occurrences to ``total_comment_dict``.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    danmu_pattern = re.compile('<d p=".*?">(.*?)</d>')

    for get_cid_index, cid in enumerate(total_cid_list, start=1):
        print(f"正在获取第{get_cid_index}个视频的弹幕")
        DanMu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        response = requests.get(DanMu_url, headers=headers, timeout=10)
        # Force UTF-8 before reading .text so the danmaku text decodes correctly.
        response.encoding = 'utf-8'

        # Count this video's danmaku, then merge into the global totals.
        current_comment_counts = Counter(danmu_pattern.findall(response.text))
        for danmu, count in current_comment_counts.items():
            total_comment_dict[danmu] = total_comment_dict.get(danmu, 0) + count

        # Pause between videos to avoid hammering the API.
        time.sleep(0.5)
# Filter the collected danmaku down to the AI-related ones
def Sortdanmu():
    """Select AI-related danmaku, sort them by count and export to Excel.

    A danmaku from ``total_comment_dict`` is kept when it contains the
    upper-case substring ``AI``, or when it contains ``ai`` (any case)
    directly next to a CJK character and does not mention "aiden" (any case).
    The result, ordered by descending count, is stored in the module-level
    ``sorted_comment_dict``, printed, and written to ``comments.xlsx``.
    """
    global sorted_comment_dict

    ai_pattern1 = re.compile(r'ai[\u4e00-\u9fff]', re.IGNORECASE)
    ai_pattern2 = re.compile(r'[\u4e00-\u9fff]ai', re.IGNORECASE)

    ai_comment = {}
    for k, v in total_comment_dict.items():
        # One case-insensitive check covers 'aiden' / 'Aiden' for both patterns.
        mentions_aiden = 'aiden' in k.lower()
        ai_next_to_cjk = ai_pattern1.search(k) or ai_pattern2.search(k)
        if 'AI' in k or (ai_next_to_cjk and not mentions_aiden):
            ai_comment[k] = v

    sorted_comment_dict = dict(sorted(ai_comment.items(), key=lambda x: x[1], reverse=True))
    print(sorted_comment_dict)
    df = pd.DataFrame(list(sorted_comment_dict.items()), columns=['Comment', 'Count'])
    df.to_excel('comments.xlsx', index=False)
def CreatWordCloud():
    """Render a word cloud from ``sorted_comment_dict``, save and display it.

    Each comment is repeated as many times as it occurred so that its
    frequency drives its size. The image is written to ``wordcloud.png``
    and shown with matplotlib. Nothing is rendered when there are no
    comments, because ``WordCloud.generate`` raises ``ValueError`` on empty
    text.
    """
    # Build the input text: every comment repeated `count` times.
    comment_text = ' '.join([((k + ' ') * v) for k, v in sorted_comment_dict.items()])
    if not comment_text.strip():
        print("No AI-related danmaku found; skipping word cloud.")
        return

    wordcloud = WordCloud(
        font_path='C:/Windows/Fonts/simsun.ttc',  # CJK-capable font (Windows path)
        width=800, height=400,
        background_color='white',
    ).generate(comment_text)

    # Persist and display the result instead of discarding it.
    wordcloud.to_file('wordcloud.png')
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
def main():
    """Run the pipeline: collect cids, fetch danmaku, filter/export, draw."""
    GetCid()
    Getdanmu()
    Sortdanmu()
    CreatWordCloud()


if __name__ == "__main__":
    main()
Loading…
Cancel
Save