"""Collect danmaku (bullet comments) for Bilibili videos matching a keyword.

Steps visible in this part of the file:

1. ``GetCid`` walks the search result pages, opens every video page found
   there and records that video's ``cid`` in ``total_cid_list``.
2. ``GetDanmu`` downloads the danmaku list for every recorded ``cid`` and
   accumulates per-text occurrence counts in ``total_comment_dict``.

NOTE(review): this text was reconstructed from a whitespace-collapsed
``git diff`` of ``zuoye.py`` (new file, index 0000000..708303b). Block
nesting was inferred from syntax, not from the original indentation.
"""

import re
import time
from collections import Counter
from urllib.parse import quote

import jieba
import matplotlib.pyplot as plt
import pandas as pd  # pandas is used for data processing
import requests
from wordcloud import WordCloud

query = "2024巴黎奥运会"  # search keyword
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
count = 300  # GetCid stops once this many cids have been recorded
total_page = 10  # number of search result pages GetCid walks
cid_pattern = re.compile(r'"cid":(\d+)')  # extracts the cid from a video page
total_cid_list = []
total_comment_dict = {}
bvid_pattern = re.compile(r'bvid:"(.*?)"')  # extracts bvids from a search page
sorted_comment_dict = {}

# Search URL shared by every result page; only vt, keyword and page vary.
_SEARCH_URL_TEMPLATE = (
    "https://search.bilibili.com/all?vt={vt}&keyword={keyword}"
    "&from_source=webtop_search&spm_id_from=333.1007&search_source=5"
    "&page={page}&o=36"
)

# One danmaku entry in the list returned by the dm endpoint.
# NOTE(review): the pattern in SOURCE had been reduced to '(.*?)', which only
# ever matches empty strings. It is restored here to the `<d p="...">text</d>`
# element form; confirm against the upstream file.
_danmu_pattern = re.compile(r'<d p=".*?">(.*?)</d>')


def _search_url(vt, page):
    """Return the search URL for ``query`` with the given ``vt`` and page."""
    return _SEARCH_URL_TEMPLATE.format(vt=vt, keyword=quote(query), page=page)


def GetFirstBidUrl():
    """Return the search URL that GetCid requests on its first iteration.

    NOTE(review): this URL carries ``page=2``, and GetCid's second iteration
    also requests ``page=2`` (with a different ``vt``), so the same result
    page may be processed twice. Left unchanged; confirm the intent.
    """
    return _search_url(vt=82099157, page=2)


def fetch(url, timeout=10):
    """Download ``url`` using the module-level ``headers``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server. Without a timeout the
            request can block indefinitely.

    Returns:
        The response object on success, or ``None`` when the request fails
        or the server answers with an HTTP error status. The error is
        printed in that case.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求错误: {e}")
        return None
    return response


def GetCid():
    """Append the cid of every video found by the search to ``total_cid_list``.

    Walks pages 1..``total_page``; stops early once ``count`` cids have been
    collected. Pages or videos that cannot be fetched are skipped, as are
    video pages in which no cid can be found.
    """
    for page in range(1, total_page + 1):
        if len(total_cid_list) >= count:
            break

        print(f"处理第{page}页...")
        if page == 1:
            search_url = GetFirstBidUrl()
        else:
            search_url = _search_url(vt=82451961, page=page)

        search_response = fetch(search_url)
        if search_response is None:
            continue

        for bvid in bvid_pattern.findall(search_response.text):
            video_response = fetch(f"https://www.bilibili.com/video/{bvid}")
            if video_response is None:
                continue

            cid_match = cid_pattern.search(video_response.text)
            if cid_match is None:
                # A page without a cid used to crash the whole run with
                # AttributeError; skip just this video instead.
                print(f"未能从视频页面提取cid: {bvid}")
                continue

            current_cid = cid_match.group(1)
            print(f"获取到cid: {current_cid}")
            total_cid_list.append(current_cid)
            if len(total_cid_list) >= count:
                break

            # Pause so requests are not sent too quickly.
            # NOTE(review): nesting level of this sleep is inferred.
            time.sleep(1)


def GetDanmu():
    """Add the danmaku counts of every collected video to ``total_comment_dict``.

    For each cid in ``total_cid_list`` the danmaku list is downloaded and
    every danmaku text is counted. Counts for identical texts are summed
    across videos. Videos whose list cannot be fetched are skipped.
    """
    for position, cid in enumerate(total_cid_list, start=1):
        print(f"正在获取第{position}个视频的弹幕")
        response = fetch(f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}")
        if response is None:
            continue

        # Force UTF-8 so the text is decoded the same way for every response.
        response.encoding = 'utf-8'
        per_video_counts = Counter(_danmu_pattern.findall(response.text))

        for text, occurrences in per_video_counts.items():
            total_comment_dict[text] = total_comment_dict.get(text, 0) + occurrences

        # Pause so requests are not sent too quickly.
        # NOTE(review): nesting level of this sleep is inferred.
        time.sleep(0.5)


# NOTE(review): SOURCE is cut off from here on. The visible remainder is the
# opening of SortDanmu ("sort and filter the danmaku"):
#
#     def SortDanmu():
#         global sorted_comment_dict
#         ai_pattern = re.compile(r'(?
#
# It ends inside the regex literal, so it is recorded here rather than
# reconstructed.