"""Crawl Bilibili danmaku (bullet comments) for a search term and count keywords.

Pipeline:
    1. Search Bilibili and collect the BVIDs of matching videos.
    2. Resolve each BVID to its cid and download that video's danmaku XML.
    3. Find the danmaku containing any of the configured keywords.
    4. Write the matching danmaku and the per-keyword counts to Excel files.
"""
import os
import re
import time
from collections import Counter

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Number of videos to crawl and the search term used to find them.
BV_NUM = 100
SEARCH_CONTENT = "吉伊卡哇chiikawa"

# Browser-like request headers; the site rejects obviously scripted clients.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
    "Referer": "https://search.bilibili.com/all?",
}

# Seconds to wait for any single HTTP response before giving up on it.
REQUEST_TIMEOUT = 10

# Extracts the BVID of every video embedded in the search page's inline state.
# Compiled once at module level because it is reused for every result page.
BVID_PATTERN = re.compile(r'aid:.*?bvid:"(?P<bvs>.*?)",')


def get_bv(num, keyword=SEARCH_CONTENT):
    """Collect up to ``num`` unique video BVIDs from Bilibili search results.

    Args:
        num: Number of BVIDs wanted.
        keyword: Search term; defaults to ``SEARCH_CONTENT``.

    Returns:
        A set of BVID strings. It holds fewer than ``num`` entries when the
        search results run out or a result page cannot be fetched.
    """
    bv_list = set()  # a set, so videos repeated across pages are kept once
    page = 1

    while len(bv_list) < num:
        # Passing the query via ``params`` lets requests URL-encode the keyword.
        response = requests.get(
            "https://search.bilibili.com/all",
            params={"keyword": keyword, "page": page},
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        if not response.ok:
            print(f"搜索第 {page} 页失败 (HTTP {response.status_code}),停止搜索")
            break

        found_before = len(bv_list)
        for match in BVID_PATTERN.finditer(response.text):
            bv_list.add(match.group("bvs"))
            # Stop as soon as the requested amount has been reached.
            if len(bv_list) >= num:
                return bv_list

        # A page that adds nothing new means the results are exhausted (or the
        # request was blocked); without this check the loop would never end.
        if len(bv_list) == found_before:
            break

        page += 1

    return bv_list


def _fetch_cid(bv):
    """Return the cid of the first part of video ``bv``, or None if unavailable."""
    response = requests.get(
        "https://api.bilibili.com/x/player/pagelist",
        params={"bvid": bv, "jsonp": "jsonp"},
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    try:
        payload = response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error page).
        return None

    # On failure the API may answer with "data": null or an empty list, so the
    # first element cannot be indexed unconditionally.
    pages = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(pages, list) or not pages:
        return None
    first_page = pages[0]
    return first_page.get("cid") if isinstance(first_page, dict) else None


def fetch_bullet_screen(bv_list):
    """Download the danmaku text of every video in ``bv_list``.

    Videos whose cid or danmaku cannot be fetched are reported and skipped so
    that one failing request does not discard the rest of the crawl.

    Args:
        bv_list: Iterable of BVID strings.

    Returns:
        A list with the text of every danmaku collected.
    """
    my_bullet = []
    for bv in bv_list:
        try:
            cid = _fetch_cid(bv)
            if not cid:
                print(f"无法获取 {bv} 的cid")
                continue

            response_bullet = requests.get(
                f"https://comment.bilibili.com/{cid}.xml",
                headers=HEADERS,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            print(f"无法获取 {bv} 的弹幕: {exc}")
            continue

        # Decode explicitly as UTF-8 instead of trusting the guessed charset.
        response_bullet.encoding = "utf-8"

        # Each danmaku is a <d> element of the XML document.
        soup = BeautifulSoup(response_bullet.text, "xml")
        my_bullet.extend(danmu.text for danmu in soup.find_all("d"))
        print(f"已成功爬取 {bv} 的弹幕")

    return my_bullet


def analyze_bullet_screen_with_ai_sentences(my_bullet, ai_keywords):
    """Find the danmaku that mention any keyword and count keyword hits.

    Args:
        my_bullet: List of danmaku strings.
        ai_keywords: List of keywords to look for (substring match).

    Returns:
        A tuple ``(ai_sentences, keyword_only_sentences, keyword_counts)``:
        ``ai_sentences`` holds each matching danmaku once, however many
        keywords it contains; ``keyword_only_sentences`` holds one entry per
        (danmaku, keyword) hit; ``keyword_counts`` is a Counter mapping each
        keyword to the number of danmaku containing it.
    """
    ai_sentences = []
    keyword_only_sentences = []
    keyword_counts = Counter()

    for bullet in my_bullet:
        matched = [keyword for keyword in ai_keywords if keyword in bullet]
        if not matched:
            continue
        # Record the danmaku once; appending it per keyword would inflate the
        # number of matching danmaku reported to the user.
        ai_sentences.append(bullet)
        keyword_only_sentences.extend(matched)
        keyword_counts.update(matched)

    return ai_sentences, keyword_only_sentences, keyword_counts


def _ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it does not exist."""
    # Only filesystem paths have a parent directory; ``to_excel`` also accepts
    # writer objects, which are left untouched.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)


def save_keyword_counts_to_excel(keyword_counts, path='./Chiikawa/keyword_counts.xlsx'):
    """Write keyword frequencies to an Excel file, most frequent first.

    Args:
        keyword_counts: Counter (or dict) mapping keyword to occurrence count.
        path: Destination Excel file.
    """
    counts_df = pd.DataFrame(list(keyword_counts.items()), columns=["关键词", "出现次数"])
    counts_df = counts_df.sort_values(by='出现次数', ascending=False)

    _ensure_parent_dir(path)
    counts_df.to_excel(path, index=False)
    print(f"关键词统计已保存到 {path}")


def save_sentences_to_excel(ai_sentences, keyword_only_sentences, path1='./Chiikawa/sentences.xlsx', path2='./Chiikawa/keyword_only_sentences.xlsx', choose1=True, choose2=True):
    """Write the matching danmaku and/or the matched keywords to Excel files.

    Args:
        ai_sentences: List of danmaku that contain a keyword.
        keyword_only_sentences: List of the keywords that were matched.
        path1: Destination Excel file for ``ai_sentences``.
        path2: Destination Excel file for ``keyword_only_sentences``.
        choose1: Whether to write ``path1``.
        choose2: Whether to write ``path2``.
    """
    # Each confirmation is printed only for a file that was actually written.
    if choose1:
        ai_df = pd.DataFrame(ai_sentences, columns=["包含关键词的弹幕"])
        _ensure_parent_dir(path1)
        ai_df.to_excel(path1, index=False)
        print(f"包含关键词的弹幕句子已保存到 {path1}")
    if choose2:
        keyword_only_df = pd.DataFrame(keyword_only_sentences, columns=["关键词"])
        _ensure_parent_dir(path2)
        keyword_only_df.to_excel(path2, index=False)
        print(f"只包含关键词的句子已保存到 {path2}")


def main_bullet():
    """Run the full crawl, analysis and export pipeline."""
    bv_list = get_bv(BV_NUM)

    bullet_screens = fetch_bullet_screen(bv_list)
    print(f"获取了 {len(bullet_screens)} 条弹幕")

    ai_keywords = [
        "537", "小八", "乌萨奇", "吉伊", '飞鼠', '手工凯', '可爱', '呀哈',
    ]

    ai_sentences, keyword_only_sentences, keyword_counts = analyze_bullet_screen_with_ai_sentences(bullet_screens, ai_keywords)
    print(f"提取了 {len(ai_sentences)} 条包含相关关键词的弹幕")

    save_sentences_to_excel(ai_sentences, keyword_only_sentences, choose2=False)
    save_keyword_counts_to_excel(keyword_counts)


if __name__ == '__main__':
    main_bullet()