ADD file via upload

2 years ago · 8008c1fde5
parent 86321289b9
commit 8008c1fde5
1 changed files with 114 additions and 0 deletions
--- a/zuoye.py
+++ b/zuoye.py
@ -0,0 +1,114 @@
+import requests   
+import re   
+import time   
+from collections import Counter   
+import pandas as pd  # 导入pandas库用于数据处理
+from wordcloud import WordCloud   
+import matplotlib.pyplot as plt   
+import jieba   
+
+# --- Search settings -------------------------------------------------------
+# Search keyword. The search URLs built further down embed the same keyword
+# in percent-encoded form rather than reading this variable.
+query = "2024巴黎奥运会"
+count = 300  # stop collecting once this many cids have been gathered
+total_page = 10  # number of search-result pages to walk
+
+# Browser-like headers sent with every request made through fetch().
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
+}
+
+# --- Precompiled extractors ------------------------------------------------
+bvid_pattern = re.compile(r'bvid:"(.*?)"')  # bvid values inside a search page
+cid_pattern = re.compile(r'"cid":(\d+)')  # numeric cid inside a video page
+
+# --- Shared state filled in by the pipeline functions ----------------------
+total_cid_list = []  # cids appended by GetCid
+total_comment_dict = {}  # danmaku text -> occurrence count (GetDanmu)
+sorted_comment_dict = {}  # filtered and ranked comments (SortDanmu)
+
+def GetFirstBidUrl():
+    """Return the search URL for the first page of results.
+
+    The URL asks for ``page=1`` explicitly and carries no result offset, so
+    the first iteration of the page loop in GetCid fetches the first results
+    page instead of repeating the request made for page 2.
+
+    Returns:
+        str: The search URL with the keyword already percent-encoded.
+    """
+    return "https://search.bilibili.com/all?vt=82099157&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=1"
+
+def fetch(url, timeout=10):
+    """Send a GET request with the shared headers and return the response.
+
+    Args:
+        url: Address to request.
+        timeout: Seconds to wait for the server before giving up. Without a
+            timeout a stalled connection would block the whole run.
+
+    Returns:
+        The ``requests.Response`` on success, or ``None`` when the request
+        fails, times out, or the server answers with an error status. The
+        error is printed, not raised.
+    """
+    try:
+        response = requests.get(url, headers=headers, timeout=timeout)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        # Timeouts and HTTP error statuses are RequestException subclasses.
+        print(f"请求错误: {e}")
+        return None
+    return response
+
+def GetCid():
+    """Collect video cids from the search-result pages into total_cid_list.
+
+    Walks pages 1..total_page, extracts every bvid from each search page,
+    opens the matching video page and appends the first cid found there.
+    Stops as soon as ``count`` cids have been collected.
+
+    Each bvid is visited at most once and each cid is stored at most once, so
+    a video that appears several times in the results is not counted twice.
+    Video pages without a cid match are reported and skipped.
+    """
+    seen_bvids = set()
+    seen_cids = set(total_cid_list)
+
+    for page in range(1, total_page + 1):
+        if len(total_cid_list) >= count:
+            break
+
+        print(f"处理第{page}页...")
+        search_url = GetFirstBidUrl() if page == 1 else f"https://search.bilibili.com/all?vt=82451961&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={page}&o=36"
+
+        response = fetch(search_url)
+        if response is None:
+            continue
+
+        # dict.fromkeys drops repeated matches while keeping page order.
+        for bvid in dict.fromkeys(bvid_pattern.findall(response.text)):
+            if bvid in seen_bvids:
+                continue
+            seen_bvids.add(bvid)
+
+            video_response = fetch(f"https://www.bilibili.com/video/{bvid}")
+            if video_response is None:
+                continue
+
+            match = cid_pattern.search(video_response.text)
+            if match is None:
+                # Page layout without a cid: skip it, do not crash.
+                print(f"未找到cid，跳过: {bvid}")
+                continue
+
+            current_cid = match.group(1)
+            if current_cid in seen_cids:
+                continue
+            seen_cids.add(current_cid)
+
+            print(f"获取到cid: {current_cid}")
+            total_cid_list.append(current_cid)
+            if len(total_cid_list) >= count:
+                break
+
+        time.sleep(1)  # pause between search pages to keep the request rate low
+
+def GetDanmu():
+    """Download the danmaku of every collected cid and tally the texts.
+
+    For each cid in total_cid_list the danmaku list endpoint is requested and
+    the text of every ``<d p="...">text</d>`` element is extracted. Character
+    entities such as ``&amp;`` or ``&lt;`` are decoded before counting, so a
+    comment is stored as the text viewers typed and not in escaped form.
+    The per-text counts are added to total_comment_dict.
+
+    Videos whose request fails are skipped.
+    """
+    # Function-scope import keeps this block self-contained.
+    from html import unescape
+
+    for index, cid in enumerate(total_cid_list, start=1):
+        print(f"正在获取第{index}个视频的弹幕")
+        response = fetch(f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}")
+        if response is None:
+            continue
+
+        # Force UTF-8 so response.text decodes the Chinese text correctly.
+        response.encoding = 'utf-8'
+        raw_texts = re.findall(r'<d p=".*?">(.*?)</d>', response.text)
+
+        per_video = Counter(unescape(text) for text in raw_texts)
+        for text, occurrences in per_video.items():
+            total_comment_dict[text] = total_comment_dict.get(text, 0) + occurrences
+
+        time.sleep(0.5)  # pause between videos to keep the request rate low
+
+def SortDanmu():
+    """Keep the AI-related danmaku, rank them by frequency and export them.
+
+    Reads total_comment_dict, keeps the entries whose text mentions "ai" (not
+    attached to other ASCII letters, any letter case) or "人工智能", and
+    rebinds the module-level sorted_comment_dict to those entries ordered by
+    descending count. The ranked dict is printed and also written to
+    ``comments.xlsx`` with the columns ``Comment`` and ``Count``.
+    """
+    global sorted_comment_dict
+
+    matcher = re.compile(r'(?<![a-zA-Z])ai(?![a-zA-Z])|人工智能', re.IGNORECASE)
+
+    ai_related = {}
+    for text, freq in total_comment_dict.items():
+        if matcher.search(text):
+            ai_related[text] = freq
+
+    # most_common() sorts by count, highest first; ties keep insertion order.
+    ranked = Counter(ai_related).most_common()
+    sorted_comment_dict = dict(ranked)
+    print(sorted_comment_dict)
+
+    table = pd.DataFrame(ranked, columns=['Comment', 'Count'])
+    table.to_excel('comments.xlsx', index=False)
+    print("已成功写入Excel文件")
+
+def CreateWordCloud(font_path='C:/Windows/Fonts/simsun.ttc'):
+    """Draw a word cloud from the comments in sorted_comment_dict.
+
+    Each comment is segmented with jieba and the segments are joined with
+    spaces, so the cloud is built from individual words, not from whole
+    comments glued back together.
+
+    Args:
+        font_path: Font file used for rendering. The default is a Windows
+            system font; pass another path on other platforms.
+
+    Returns:
+        None. Shows the figure, or prints a notice and returns early when
+        there is no text to draw.
+    """
+    segmented = [
+        ' '.join(jieba.cut(text, cut_all=False))
+        for text in sorted_comment_dict
+    ]
+    comment_text = ' '.join(segmented)
+
+    # WordCloud.generate raises on empty input; report it and return.
+    if not comment_text.strip():
+        print("没有可用于生成词云的弹幕")
+        return
+
+    wordcloud = WordCloud(
+        font_path=font_path,
+        width=800, height=400,
+        background_color='white',
+        max_words=200,
+        colormap='viridis',
+    ).generate(comment_text)
+
+    plt.figure(figsize=(10, 5))
+    plt.imshow(wordcloud, interpolation='bilinear')
+    plt.axis('off')  # hide the axes; only the image matters
+    plt.show()
+
+def main():
+    """Run the pipeline steps in order.
+
+    Order: collect cids, download danmaku, rank and export the AI-related
+    comments, then draw the word cloud.
+    """
+    for step in (GetCid, GetDanmu, SortDanmu, CreateWordCloud):
+        step()
+
+
+if __name__ == "__main__":
+    main()