|
|
@ -6,28 +6,22 @@ import pandas as pd
|
|
|
|
from wordcloud import WordCloud
|
|
|
|
from wordcloud import WordCloud
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
|
|
|
|
query = "2024巴黎奥运会"
|
|
|
|
query = "巴黎奥运会"
|
|
|
|
headers = {
|
|
|
|
headers = {
|
|
|
|
"Cookie": "buvid3=F85083C9-B0B0-58EF-387E-9810D717FBD394717infoc; b_nut=1695630694; i-wanna-go-back=-1; b_ut=7; _uuid=4691069C1-57109-F951-5C2C-71061B15CAB9C93820infoc; buvid4=80C1A4DB-57B6-89F1-B7AB-7AE606C3BFB795506-023092516-b1nz50QSFWAVh9QAs1wBqg%3D%3D; DedeUserID=391260816; DedeUserID__ckMd5=874384c11cc311ca; hit-dyn-v2=1; rpdid=|(JlRYJ~Yk||0J'uYmlYJ|~mu; buvid_fp_plain=undefined; LIVE_BUVID=AUTO7816956505396915; is-2022-channel=1; enable_web_push=DISABLE; header_theme_version=CLOSE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; CURRENT_BLACKGAP=0; bp_video_offset_391260816=964407697698979840; CURRENT_FNVAL=4048; CURRENT_QUALITY=116; fingerprint=0caf6ff40a6d821a9253179cd16721cc; buvid_fp=daecdb2a27b0352be0af14099f69b721; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU1MDk5MzUsImlhdCI6MTcyNTI1MDY3NSwicGx0IjotMX0.uE2PcZgAdDTBtqyfu7qsT_GKqNMsmsvjtdKYmeQ0eno; bili_ticket_expires=1725509875; SESSDATA=d4e31c61%2C1740843740%2Cc4b21%2A91CjBgFJe4MbiVSvKl_Z-oJcHfxPNmwxIX4iMw7S41V1DMuuAhaahCmSK6_p-xsyPHvC8SVi13bXN4RE40V2NCeGYwNWhYclNJckNfaGx4SzZydk05aE56ajdkS2dzZUVRWG9YeE5jbXFVdXF1aTZWTmxQZnRjZXZYaHJLU1dleElsRVczZG4wQW9RIIEC; bili_jct=f25b09f990746c712d4ef672d19e2628; PVID=1; sid=84brlx1u; home_feed_column=5; browser_resolution=2048-1018; bp_t_offset_391260816=973171033005621248; b_lsid=54110E26A_191BB365E57",
|
|
|
|
"Cookie": "buvid3=F85083C9-B0B0-58EF-387E-9810D717FBD394717infoc; b_nut=1695630694; i-wanna-go-back=-1; b_ut=7; _uuid=4691069C1-57109-F951-5C2C-71061B15CAB9C93820infoc; buvid4=80C1A4DB-57B6-89F1-B7AB-7AE606C3BFB795506-023092516-b1nz50QSFWAVh9QAs1wBqg%3D%3D; DedeUserID=391260816; DedeUserID__ckMd5=874384c11cc311ca; hit-dyn-v2=1; rpdid=|(JlRYJ~Yk||0J'uYmlYJ|~mu; buvid_fp_plain=undefined; LIVE_BUVID=AUTO7816956505396915; is-2022-channel=1; enable_web_push=DISABLE; header_theme_version=CLOSE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; CURRENT_BLACKGAP=0; bp_video_offset_391260816=964407697698979840; CURRENT_FNVAL=4048; CURRENT_QUALITY=116; fingerprint=0caf6ff40a6d821a9253179cd16721cc; buvid_fp=daecdb2a27b0352be0af14099f69b721; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU1MDk5MzUsImlhdCI6MTcyNTI1MDY3NSwicGx0IjotMX0.uE2PcZgAdDTBtqyfu7qsT_GKqNMsmsvjtdKYmeQ0eno; bili_ticket_expires=1725509875; SESSDATA=d4e31c61%2C1740843740%2Cc4b21%2A91CjBgFJe4MbiVSvKl_Z-oJcHfxPNmwxIX4iMw7S41V1DMuuAhaahCmSK6_p-xsyPHvC8SVi13bXN4RE40V2NCeGYwNWhYclNJckNfaGx4SzZydk05aE56ajdkS2dzZUVRWG9YeE5jbXFVdXF1aTZWTmxQZnRjZXZYaHJLU1dleElsRVczZG4wQW9RIIEC; bili_jct=f25b09f990746c712d4ef672d19e2628; PVID=1; sid=84brlx1u; home_feed_column=5; browser_resolution=2048-1018; bp_t_offset_391260816=973171033005621248; b_lsid=54110E26A_191BB365E57",
|
|
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
|
|
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
total_page = 10
|
|
|
|
total_page = 10
|
|
|
|
cid_pattern = re.compile(r'"cid":(\d+)')
|
|
|
|
cid_pattern = re.compile(r'"cid":(\d+)')
|
|
|
|
total_cid_list = []
|
|
|
|
total_cid_list = []
|
|
|
|
total_comment_dict = {}
|
|
|
|
total_comment_dict = {}
|
|
|
|
bvid_pattern = re.compile(r'bvid:"(.*?)"')
|
|
|
|
bvid_pattern = re.compile(r'bvid:"(.*?)"')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def GetFirstBidUrl(): # 获取第一个视频的bid
|
|
|
|
def GetFirstBidUrl(): # 获取第一个视频的bid
|
|
|
|
return "https://search.bilibili.com/all?vt=82099157&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=2&o=36"
|
|
|
|
return "https://search.bilibili.com/all?vt=82099157&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=2&o=36"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def GetCid(): # 获取300个视频的 bvid
|
|
|
|
def GetCid(): # 获取300个视频的 bvid
|
|
|
|
for page in range(1, total_page + 1):
|
|
|
|
for page in range(1, total_page + 1):
|
|
|
|
if len(total_cid_list) >= 300:
|
|
|
|
if len(total_cid_list) >= 300:
|
|
|
|
break
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
print(f"Processing page {page}...\n", )
|
|
|
|
print(f"Processing page {page}...\n", )
|
|
|
|
start = time.time()
|
|
|
|
start = time.time()
|
|
|
|
if page == 1:
|
|
|
|
if page == 1:
|
|
|
@ -35,14 +29,10 @@ def GetCid(): # 获取300个视频的 bvid
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
search_url = f"https://search.bilibili.com/all?vt=82451961&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}&o=36"
|
|
|
|
search_url = f"https://search.bilibili.com/all?vt=82451961&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}&o=36"
|
|
|
|
respons = requests.get(search_url, headers=headers)
|
|
|
|
respons = requests.get(search_url, headers=headers)
|
|
|
|
|
|
|
|
|
|
|
|
current_bvid_list = bvid_pattern.findall(respons.text)
|
|
|
|
current_bvid_list = bvid_pattern.findall(respons.text)
|
|
|
|
|
|
|
|
|
|
|
|
end = time.time()
|
|
|
|
end = time.time()
|
|
|
|
print(f"获取bid用时{end - start}s\n")
|
|
|
|
print(f"获取bid用时{end - start}s\n")
|
|
|
|
|
|
|
|
|
|
|
|
start = time.time()
|
|
|
|
start = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
# 通过bvid获取300个视频的cid
|
|
|
|
# 通过bvid获取300个视频的cid
|
|
|
|
for index, bvid in enumerate(current_bvid_list):
|
|
|
|
for index, bvid in enumerate(current_bvid_list):
|
|
|
|
video_url = f"https://www.bilibili.com/video/{bvid}"
|
|
|
|
video_url = f"https://www.bilibili.com/video/{bvid}"
|
|
|
@ -56,7 +46,6 @@ def GetCid(): # 获取300个视频的 bvid
|
|
|
|
end = time.time()
|
|
|
|
end = time.time()
|
|
|
|
print(f"获取cid用时:{end - start}s\n")
|
|
|
|
print(f"获取cid用时:{end - start}s\n")
|
|
|
|
time.sleep(1)
|
|
|
|
time.sleep(1)
|
|
|
|
|
|
|
|
|
|
|
|
def Getdanmu(): # 遍历所有视频的 cid,获取对应弹幕
|
|
|
|
def Getdanmu(): # 遍历所有视频的 cid,获取对应弹幕
|
|
|
|
get_cid_index = 0
|
|
|
|
get_cid_index = 0
|
|
|
|
for cid in total_cid_list:
|
|
|
|
for cid in total_cid_list:
|
|
|
@ -89,7 +78,7 @@ def Sortdanmu():
|
|
|
|
ai_comment[k] = v
|
|
|
|
ai_comment[k] = v
|
|
|
|
if ai_pattern2.search(k) and 'aiden' not in k and 'Adien' not in k:
|
|
|
|
if ai_pattern2.search(k) and 'aiden' not in k and 'Adien' not in k:
|
|
|
|
ai_comment[k] = v
|
|
|
|
ai_comment[k] = v
|
|
|
|
if '人工智能' in k:
|
|
|
|
if 'AI' in k:
|
|
|
|
ai_comment[k] = v
|
|
|
|
ai_comment[k] = v
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -105,8 +94,6 @@ def CreatWordCloud():
|
|
|
|
font_path='C:/Windows/Fonts/simsun.ttc',
|
|
|
|
font_path='C:/Windows/Fonts/simsun.ttc',
|
|
|
|
width=800, height=400,
|
|
|
|
width=800, height=400,
|
|
|
|
background_color='white',
|
|
|
|
background_color='white',
|
|
|
|
max_words=200,
|
|
|
|
|
|
|
|
colormap='viridis'
|
|
|
|
|
|
|
|
).generate(comment_text)
|
|
|
|
).generate(comment_text)
|
|
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
def main():
|
|
|
@ -115,6 +102,5 @@ def main():
|
|
|
|
Sortdanmu()
|
|
|
|
Sortdanmu()
|
|
|
|
CreatWordCloud()
|
|
|
|
CreatWordCloud()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
if __name__ == "__main__":
|
|
|
|
main()
|
|
|
|
main()
|