"""Scrape Bilibili danmaku (bullet-screen comments) for a search keyword,
extract comments mentioning AI-related terms, and export the results to Excel.

Pipeline: get_bv -> fetch_bullet_screen -> analyze_bullet_screen_with_ai_sentences
-> save_sentences_to_excel / save_keyword_counts_to_excel.
"""

import re
from collections import Counter

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Number of videos to collect and the search keyword.
BV_NUM = 300
SEARCH_CONTENT = "2024巴黎奥运会"

# Request headers to get past simple anti-scraping checks.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
    "Referer": "https://search.bilibili.com/all?",
}

# Seconds to wait for any HTTP response before giving up instead of hanging.
REQUEST_TIMEOUT = 10


def get_bv(num):
    """Collect up to `num` unique video BVIDs for SEARCH_CONTENT.

    Args:
        num: target number of BVIDs.

    Returns:
        A set of BVID strings. May contain fewer than `num` entries when the
        search results are exhausted.
    """
    bv_list = set()  # set: de-duplicates BVIDs repeated across result pages
    page = 1
    # FIX: the original pattern lacked the group name (`(?P.*?)`) that
    # match.group("bvs") below requires — re.compile would raise re.error.
    pattern = re.compile(r'aid:.*?bvid:"(?P<bvs>.*?)",')

    while len(bv_list) < num:
        search_url = f"https://search.bilibili.com/all?keyword={SEARCH_CONTENT}&page={page}"
        response = requests.get(search_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)

        before = len(bv_list)
        for match in pattern.finditer(response.text):
            bv_list.add(match.group("bvs"))
            if len(bv_list) >= num:
                return bv_list

        # FIX: a page yielding no new BVIDs means results are exhausted;
        # the original looped forever in that case.
        if len(bv_list) == before:
            break
        page += 1

    return bv_list


def fetch_bullet_screen(bv_list):
    """Fetch the danmaku text for every BVID in `bv_list`.

    Args:
        bv_list: iterable of BVID strings.

    Returns:
        A flat list of danmaku strings across all videos.
    """
    my_bullet = []
    for bv in bv_list:
        cid_url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp"
        response_cid = requests.get(cid_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)

        # FIX: `.get("data", [{}])[0]` raised IndexError when the API returned
        # an *empty* "data" list — the default only covers a missing key.
        pages = response_cid.json().get("data") or [{}]
        cid = pages[0].get("cid")
        if not cid:
            print(f"无法获取 {bv} 的cid")
            continue

        # Danmaku are served as an XML document keyed by the video's cid.
        response_bullet = requests.get(
            f"https://comment.bilibili.com/{cid}.xml",
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        response_bullet.encoding = "utf-8"

        # Each <d> element's text is one danmaku message.
        soup = BeautifulSoup(response_bullet.text, "xml")
        my_bullet.extend(danmu.text for danmu in soup.find_all("d"))
        print(f"已成功爬取 {bv} 的弹幕")

    return my_bullet


def analyze_bullet_screen_with_ai_sentences(my_bullet, ai_keywords):
    """Find danmaku containing any AI-related keyword.

    Args:
        my_bullet: list of danmaku strings.
        ai_keywords: list of keyword strings to search for (substring match).

    Returns:
        Tuple of (ai_sentences, keyword_only_sentences, keyword_counts):
        - ai_sentences: each matching danmaku, once per danmaku.
        - keyword_only_sentences: every keyword occurrence, in match order.
        - keyword_counts: Counter mapping keyword -> number of danmaku hits.
    """
    ai_sentences = []
    keyword_only_sentences = []
    keyword_counts = Counter()

    for bullet in my_bullet:
        matched = [keyword for keyword in ai_keywords if keyword in bullet]
        if matched:
            # FIX: the original appended the same danmaku once per matching
            # keyword, inflating the exported sentence list with duplicates;
            # it also computed-and-discarded `keywords_in_bullet` (dead code).
            ai_sentences.append(bullet)
            keyword_only_sentences.extend(matched)
            keyword_counts.update(matched)

    return ai_sentences, keyword_only_sentences, keyword_counts


def save_keyword_counts_to_excel(keyword_counts, path='keyword_counts.xlsx'):
    """Write keyword frequencies to an Excel file, sorted descending.

    Args:
        keyword_counts: Counter of keyword -> occurrence count.
        path: output .xlsx path.
    """
    counts_df = pd.DataFrame(keyword_counts.items(), columns=["关键词", "出现次数"])
    counts_df.sort_values(by='出现次数', ascending=False, inplace=True)
    counts_df.to_excel(path, index=False)
    print(f"关键词统计已保存到 {path}")


def save_sentences_to_excel(ai_sentences, keyword_only_sentences,
                            path1='ai_sentences.xlsx',
                            path2='keyword_only_sentences.xlsx',
                            choose1=True, choose2=True):
    """Write matching danmaku and/or the matched keywords to Excel files.

    Args:
        ai_sentences: danmaku containing at least one keyword.
        keyword_only_sentences: the matched keywords themselves.
        path1: output path for the danmaku sheet.
        path2: output path for the keywords sheet.
        choose1: whether to write path1.
        choose2: whether to write path2.
    """
    # FIX: the original printed both "saved" messages unconditionally, even
    # for files it was told to skip; report only what was actually written.
    if choose1:
        ai_df = pd.DataFrame(ai_sentences, columns=["包含关键词的弹幕"])
        ai_df.to_excel(path1, index=False)
        print(f"包含关键词的弹幕句子已保存到 {path1}")
    if choose2:
        keyword_only_df = pd.DataFrame(keyword_only_sentences, columns=["关键词"])
        keyword_only_df.to_excel(path2, index=False)
        print(f"只包含关键词的句子已保存到 {path2}")


def main_bullet():
    """Run the full scrape-analyze-export pipeline."""
    bv_list = get_bv(BV_NUM)
    print(bv_list)

    bullet_screens = fetch_bullet_screen(bv_list)
    print(f"获取了 {len(bullet_screens)} 条弹幕")

    # AI-related keywords to look for inside each danmaku.
    ai_keywords = [
        "AI", "人工智能", "机器学习", "深度学习", "神经网络", "算法", "智能", "大数据", "自动化", "机器人",
        "计算机视觉", "自然语言处理", "NLP", "语音识别", "自动驾驶", "边缘计算", "强化学习",
        "生成对抗网络", "GAN", "迁移学习", "数据挖掘", "语义分析", "图像识别", "深度神经网络", "DNN",
        "决策树", "随机森林", "集成学习", "模糊逻辑", "专家系统", "计算智能", "大规模并行处理",
        "分布式系统", "物联网", "IoT", "云计算", "区块链", "量子计算", "图神经网络", "GNN",
        "人机交互", "HCI", "情感分析", "机器人过程自动化", "RPA", "无人机", "UAV", "智能城市",
        "云原生", "分布式学习", "元学习", "数字孪生", "自动化运维", "AIOps",
    ]

    ai_sentences, keyword_only_sentences, keyword_counts = \
        analyze_bullet_screen_with_ai_sentences(bullet_screens, ai_keywords)
    print(f"提取了 {len(ai_sentences)} 条包含AI相关关键词的弹幕")

    save_sentences_to_excel(ai_sentences, keyword_only_sentences, choose2=False)
    save_keyword_counts_to_excel(keyword_counts)


if __name__ == '__main__':
    main_bullet()