Update python.py

10 months ago · 67bb98f033
parent c1c5700cd5
commit 67bb98f033
1 changed file with 105 additions and 119 deletions
--- a/python.py
+++ b/python.py
@@ -1,120 +1,106 @@
-import requests
+import requests
-import re
+import re
-import time
+import time
-from collections import Counter
+from collections import Counter
-import pandas as pd
+import pandas as pd
-from wordcloud import WordCloud
+from wordcloud import WordCloud
-import matplotlib.pyplot as plt
+import matplotlib.pyplot as plt
-
+
-query = "2024巴黎奥运会"
+query = "巴黎奥运会"
-headers = {
+headers = {
-    "Cookie": "buvid3=F85083C9-B0B0-58EF-387E-9810D717FBD394717infoc; b_nut=1695630694; i-wanna-go-back=-1; b_ut=7; _uuid=4691069C1-57109-F951-5C2C-71061B15CAB9C93820infoc; buvid4=80C1A4DB-57B6-89F1-B7AB-7AE606C3BFB795506-023092516-b1nz50QSFWAVh9QAs1wBqg%3D%3D; DedeUserID=391260816; DedeUserID__ckMd5=874384c11cc311ca; hit-dyn-v2=1; rpdid=|(JlRYJ~Yk||0J'uYmlYJ|~mu; buvid_fp_plain=undefined; LIVE_BUVID=AUTO7816956505396915; is-2022-channel=1; enable_web_push=DISABLE; header_theme_version=CLOSE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; CURRENT_BLACKGAP=0; bp_video_offset_391260816=964407697698979840; CURRENT_FNVAL=4048; CURRENT_QUALITY=116; fingerprint=0caf6ff40a6d821a9253179cd16721cc; buvid_fp=daecdb2a27b0352be0af14099f69b721; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU1MDk5MzUsImlhdCI6MTcyNTI1MDY3NSwicGx0IjotMX0.uE2PcZgAdDTBtqyfu7qsT_GKqNMsmsvjtdKYmeQ0eno; bili_ticket_expires=1725509875; SESSDATA=d4e31c61%2C1740843740%2Cc4b21%2A91CjBgFJe4MbiVSvKl_Z-oJcHfxPNmwxIX4iMw7S41V1DMuuAhaahCmSK6_p-xsyPHvC8SVi13bXN4RE40V2NCeGYwNWhYclNJckNfaGx4SzZydk05aE56ajdkS2dzZUVRWG9YeE5jbXFVdXF1aTZWTmxQZnRjZXZYaHJLU1dleElsRVczZG4wQW9RIIEC; bili_jct=f25b09f990746c712d4ef672d19e2628; PVID=1; sid=84brlx1u; home_feed_column=5; browser_resolution=2048-1018; bp_t_offset_391260816=973171033005621248; b_lsid=54110E26A_191BB365E57",
+    "Cookie": "buvid3=F85083C9-B0B0-58EF-387E-9810D717FBD394717infoc; b_nut=1695630694; i-wanna-go-back=-1; b_ut=7; _uuid=4691069C1-57109-F951-5C2C-71061B15CAB9C93820infoc; buvid4=80C1A4DB-57B6-89F1-B7AB-7AE606C3BFB795506-023092516-b1nz50QSFWAVh9QAs1wBqg%3D%3D; DedeUserID=391260816; DedeUserID__ckMd5=874384c11cc311ca; hit-dyn-v2=1; rpdid=|(JlRYJ~Yk||0J'uYmlYJ|~mu; buvid_fp_plain=undefined; LIVE_BUVID=AUTO7816956505396915; is-2022-channel=1; enable_web_push=DISABLE; header_theme_version=CLOSE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; CURRENT_BLACKGAP=0; bp_video_offset_391260816=964407697698979840; CURRENT_FNVAL=4048; CURRENT_QUALITY=116; fingerprint=0caf6ff40a6d821a9253179cd16721cc; buvid_fp=daecdb2a27b0352be0af14099f69b721; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU1MDk5MzUsImlhdCI6MTcyNTI1MDY3NSwicGx0IjotMX0.uE2PcZgAdDTBtqyfu7qsT_GKqNMsmsvjtdKYmeQ0eno; bili_ticket_expires=1725509875; SESSDATA=d4e31c61%2C1740843740%2Cc4b21%2A91CjBgFJe4MbiVSvKl_Z-oJcHfxPNmwxIX4iMw7S41V1DMuuAhaahCmSK6_p-xsyPHvC8SVi13bXN4RE40V2NCeGYwNWhYclNJckNfaGx4SzZydk05aE56ajdkS2dzZUVRWG9YeE5jbXFVdXF1aTZWTmxQZnRjZXZYaHJLU1dleElsRVczZG4wQW9RIIEC; bili_jct=f25b09f990746c712d4ef672d19e2628; PVID=1; sid=84brlx1u; home_feed_column=5; browser_resolution=2048-1018; bp_t_offset_391260816=973171033005621248; b_lsid=54110E26A_191BB365E57",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
-}
+}
-
+total_page = 10
-total_page = 10
+cid_pattern = re.compile(r'"cid":(\d+)')
-cid_pattern = re.compile(r'"cid":(\d+)')
+total_cid_list = []
-total_cid_list = []
+total_comment_dict = {}
-total_comment_dict = {}
+bvid_pattern = re.compile(r'bvid:"(.*?)"')
-bvid_pattern = re.compile(r'bvid:"(.*?)"')
+def GetFirstBidUrl():  # 获取第一个视频的bid
-
+    return "https://search.bilibili.com/all?vt=82099157&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=2&o=36"
-
+def GetCid():  # 获取300个视频的 bvid
-def GetFirstBidUrl():  # 获取第一个视频的bid
+    for page in range(1, total_page + 1):
-    return "https://search.bilibili.com/all?vt=82099157&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=2&o=36"
+        if len(total_cid_list) >= 300:
-
+            break
-
+        print(f"Processing page {page}...\n", )
-def GetCid():  # 获取300个视频的 bvid
+        start = time.time()
-    for page in range(1, total_page + 1):
+        if page == 1:
-        if len(total_cid_list) >= 300:
+            search_url = GetFirstBidUrl()
-            break
+        else:
-
+            search_url = f"https://search.bilibili.com/all?vt=82451961&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}&o=36"
-        print(f"Processing page {page}...\n", )
+        respons = requests.get(search_url, headers=headers)
-        start = time.time()
+        current_bvid_list = bvid_pattern.findall(respons.text)
-        if page == 1:
+        end = time.time()
-            search_url = GetFirstBidUrl()
+        print(f"获取bid用时{end - start}s\n")
-        else:
+        start = time.time()
-            search_url = f"https://search.bilibili.com/all?vt=82451961&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}&o=36"
+        # 通过bvid获取300个视频的cid
-        respons = requests.get(search_url, headers=headers)
+        for index, bvid in enumerate(current_bvid_list):
-
+            video_url = f"https://www.bilibili.com/video/{bvid}"
-        current_bvid_list = bvid_pattern.findall(respons.text)
+            respons = requests.get(video_url, headers=headers)
-
+            current_cid = cid_pattern.search(respons.text).group(1)
-        end = time.time()
+            print(f"获取到第{len(total_cid_list) + 1}个cid:{current_cid}")
-        print(f"获取bid用时{end - start}s\n")
+            total_cid_list.append(current_cid)
-
+            if len(total_cid_list) >= 300:
-        start = time.time()
+                break
-
+        # time.sleep(1)
-        # 通过bvid获取300个视频的cid
+        end = time.time()
-        for index, bvid in enumerate(current_bvid_list):
+        print(f"获取cid用时:{end - start}s\n")
-            video_url = f"https://www.bilibili.com/video/{bvid}"
+        time.sleep(1)
-            respons = requests.get(video_url, headers=headers)
+def Getdanmu():  # 遍历所有视频的 cid，获取对应弹幕
-            current_cid = cid_pattern.search(respons.text).group(1)
+    get_cid_index = 0
-            print(f"获取到第{len(total_cid_list) + 1}个cid:{current_cid}")
+    for cid in total_cid_list:
-            total_cid_list.append(current_cid)
+        get_cid_index += 1
-            if len(total_cid_list) >= 300:
+        print(f"正在获取第{get_cid_index}个视频的弹幕")
-                break
+        DanMu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
-        # time.sleep(1)
+        respons = requests.get(DanMu_url, headers=headers)
-        end = time.time()
+        respons.encoding = 'utf-8'
-        print(f"获取cid用时:{end - start}s\n")
+        current_danmu_list = re.findall('<d p=".*?">(.*?)</d>', respons.text)
-        time.sleep(1)
+        current_comment_dict = {}
-
+        # 将每条弹幕加入到总的弹幕列表中
-def Getdanmu():  # 遍历所有视频的 cid，获取对应弹幕
+        for danmu in current_danmu_list:
-    get_cid_index = 0
+            if danmu in current_comment_dict:
-    for cid in total_cid_list:
+                current_comment_dict[danmu] += 1
-        get_cid_index += 1
+            else:
-        print(f"正在获取第{get_cid_index}个视频的弹幕")
+                current_comment_dict[danmu] = 1
-        DanMu_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
+        for k, v in current_comment_dict.items():
-        respons = requests.get(DanMu_url, headers=headers)
+            if k in total_comment_dict:
-        respons.encoding = 'utf-8'
+                total_comment_dict[k] += v
-        current_danmu_list = re.findall('<d p=".*?">(.*?)</d>', respons.text)
+            else:
-        current_comment_dict = {}
+                total_comment_dict[k] = v
-        # 将每条弹幕加入到总的弹幕列表中
+        time.sleep(0.5)
-        for danmu in current_danmu_list:
+ # 在得到的弹幕里筛选与ai相关的弹幕
-            if danmu in current_comment_dict:
+def Sortdanmu():
-                current_comment_dict[danmu] += 1
+    ai_pattern1 = re.compile(r'ai[\u4e00-\u9fff]', re.IGNORECASE)
-            else:
+    ai_pattern2 = re.compile(r'[\u4e00-\u9fff]ai', re.IGNORECASE)
-                current_comment_dict[danmu] = 1
+    ai_comment = {}
-        for k, v in current_comment_dict.items():
+    for k, v in total_comment_dict.items():
-            if k in total_comment_dict:
+        if ai_pattern1.search(k) and 'aiden' not in k and 'Aiden' not in k:
-                total_comment_dict[k] += v
+            ai_comment[k] = v
-            else:
+        if ai_pattern2.search(k) and 'aiden' not in k and 'Adien' not in k:
-                total_comment_dict[k] = v
+            ai_comment[k] = v
-        time.sleep(0.5)
+        if 'AI' in k:
- # 在得到的弹幕里筛选与ai相关的弹幕
+            ai_comment[k] = v
-def Sortdanmu():
+
-    ai_pattern1 = re.compile(r'ai[\u4e00-\u9fff]', re.IGNORECASE)
+
-    ai_pattern2 = re.compile(r'[\u4e00-\u9fff]ai', re.IGNORECASE)
+    global sorted_comment_dict
-    ai_comment = {}
+    sorted_comment_dict = dict(sorted(ai_comment.items(), key=lambda x: x[1], reverse=True))
-    for k, v in total_comment_dict.items():
+    print(sorted_comment_dict)
-        if ai_pattern1.search(k) and 'aiden' not in k and 'Aiden' not in k:
+    df = pd.DataFrame(list(sorted_comment_dict.items()), columns=['Comment', 'Count'])
-            ai_comment[k] = v
+    df.to_excel('comments.xlsx', index=False)
-        if ai_pattern2.search(k) and 'aiden' not in k and 'Adien' not in k:
+def CreatWordCloud():
-            ai_comment[k] = v
+    # 根据弹幕表格生成词云图
-        if '人工智能' in k:
+    comment_text = ' '.join([((k + ' ') * v) for k, v in sorted_comment_dict.items()])
-            ai_comment[k] = v
+    wordcloud = WordCloud(
-
+        font_path='C:/Windows/Fonts/simsun.ttc',
-
+        width=800, height=400,
-    global sorted_comment_dict
+        background_color='white',
-    sorted_comment_dict = dict(sorted(ai_comment.items(), key=lambda x: x[1], reverse=True))
+    ).generate(comment_text)
-    print(sorted_comment_dict)
+
-    df = pd.DataFrame(list(sorted_comment_dict.items()), columns=['Comment', 'Count'])
+def main():
-    df.to_excel('comments.xlsx', index=False)
+    GetCid()
-def CreatWordCloud():
+    Getdanmu()
-    # 根据弹幕表格生成词云图
+    Sortdanmu()
-    comment_text = ' '.join([((k + ' ') * v) for k, v in sorted_comment_dict.items()])
+    CreatWordCloud()
-    wordcloud = WordCloud(
+
-        font_path='C:/Windows/Fonts/simsun.ttc',
+if __name__ == "__main__":
        width=800, height=400,
        background_color='white',
        max_words=200,
        colormap='viridis'
    ).generate(comment_text)
 def main():
    GetCid()
    Getdanmu()
    Sortdanmu()
    CreatWordCloud()
 if __name__ == "__main__":
    main()