You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

167 lines
6.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
import requests
from bs4 import BeautifulSoup
from collections import Counter
import pandas as pd
# Scrape configuration: how many videos to collect and what to search for.
BV_NUM = 300  # number of videos to fetch
SEARCH_CONTENT = "2024巴黎奥运会"  # search keyword (2024 Paris Olympics)
# Request headers: browser-like UA + Referer to get past basic anti-scraping checks.
HEADERS = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
    "Referer": "https://search.bilibili.com/all?"
}
# 根据搜索关键词爬取指定数量的视频BVID
# Scrape video BVIDs from Bilibili search result pages for SEARCH_CONTENT.
def get_bv(num):
    """Collect up to ``num`` unique video BVIDs from Bilibili search pages.

    Args:
        num: number of BVIDs to gather.

    Returns:
        A set of BVID strings. May contain fewer than ``num`` entries if
        the search results are exhausted first.
    """
    bv_list = set()  # set dedupes BVIDs that appear on multiple pages
    # Hoisted out of the loop: compile the extraction pattern once.
    pattern = re.compile(r'aid:.*?bvid:"(?P<bvs>.*?)",')
    page = 1
    while len(bv_list) < num:
        search_url = f"https://search.bilibili.com/all?keyword={SEARCH_CONTENT}&page={page}"
        response = requests.get(search_url, headers=HEADERS)
        found_before = len(bv_list)
        for match in pattern.finditer(response.text):
            bv_list.add(match.group("bvs"))
            # Return as soon as the target count is reached (mid-page).
            if len(bv_list) >= num:
                return bv_list
        # Bug fix: the original looped forever once the search ran out of
        # results (no matches, page += 1, repeat). Stop when a page
        # contributes no new BVIDs.
        if len(bv_list) == found_before:
            break
        page += 1
    return bv_list
# 通过bv号获取视频cid进一步获取弹幕内容
# Resolve each BV number to its cid, then download that video's danmaku XML.
def fetch_bullet_screen(bv_list):
    """Download the bullet-screen (danmaku) text for each video in ``bv_list``.

    Args:
        bv_list: iterable of BV-number strings.

    Returns:
        A flat list of danmaku strings from every video that could be fetched;
        videos whose cid cannot be resolved are skipped with a message.
    """
    my_bullet = []  # accumulates danmaku across all videos
    for bv in bv_list:
        cid_url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp"
        response_cid = requests.get(cid_url, headers=HEADERS)
        response_cid_json = response_cid.json()
        # Bug fix: on API errors "data" is present but null (or an empty
        # list), so the original ``.get("data", [{}])[0]`` raised
        # TypeError/IndexError — the default only covers a *missing* key.
        pages = response_cid_json.get("data") or [{}]
        cid = pages[0].get("cid")
        if not cid:
            print(f"无法获取 {bv} 的cid")
            continue
        # The raw danmaku feed is served as XML keyed by cid.
        response_bullet = requests.get(f"https://comment.bilibili.com/{cid}.xml", headers=HEADERS)
        response_bullet.encoding = "utf-8"
        soup = BeautifulSoup(response_bullet.text, "xml")
        danmus = soup.find_all("d")  # each <d> element holds one danmaku
        my_bullet.extend(danmu.text for danmu in danmus)
        print(f"已成功爬取 {bv} 的弹幕")
    return my_bullet
# 分析弹幕内容提取包含AI相关词语的完整句子
# Extract danmaku that mention AI-related keywords and tally keyword frequency.
def analyze_bullet_screen_with_ai_sentences(my_bullet, ai_keywords):
    """Scan danmaku for AI-related keywords (substring match).

    Args:
        my_bullet: list of danmaku strings.
        ai_keywords: keywords to search for.

    Returns:
        A 3-tuple of:
        - ai_sentences: danmaku containing at least one keyword. Bug fix:
          the original appended the danmaku once *per* matched keyword,
          duplicating multi-keyword entries; each now appears exactly once.
        - keyword_only_sentences: every matched keyword, in match order
          (one entry per keyword per danmaku, as before).
        - keyword_counts: Counter of how many danmaku contain each keyword.
    """
    ai_sentences = []
    keyword_only_sentences = []
    keyword_counts = Counter()
    for bullet in my_bullet:
        # Each keyword counts at most once per danmaku, matching the
        # original per-keyword +1 semantics.
        matched = [keyword for keyword in ai_keywords if keyword in bullet]
        if matched:
            ai_sentences.append(bullet)  # once, regardless of match count
            keyword_only_sentences.extend(matched)
            keyword_counts.update(matched)
    # (The original also built an unused ``keywords_in_bullet`` list here —
    # dead code, removed.)
    return ai_sentences, keyword_only_sentences, keyword_counts
# 将关键词统计信息保存到Excel文件
# Persist the keyword frequency table as a spreadsheet.
def save_keyword_counts_to_excel(keyword_counts, path='keyword_counts.xlsx'):
    """Write keyword occurrence counts to an Excel file, most frequent first.

    Args:
        keyword_counts: Counter (or mapping) of keyword -> occurrence count.
        path: destination .xlsx filename.
    """
    frame = pd.DataFrame(keyword_counts.items(), columns=["关键词", "出现次数"])
    # Descending by count so the hottest keywords lead the sheet.
    frame = frame.sort_values(by='出现次数', ascending=False)
    frame.to_excel(path, index=False)
    print(f"关键词统计已保存到 {path}")
# 将包含AI关键词的句子保存到Excel文件
# Persist the matched danmaku and/or the matched keywords as spreadsheets.
def save_sentences_to_excel(ai_sentences, keyword_only_sentences, path1='ai_sentences.xlsx', path2='keyword_only_sentences.xlsx', choose1=True, choose2=True):
    """Save keyword-bearing danmaku and/or matched keywords to Excel files.

    Args:
        ai_sentences: danmaku strings containing AI keywords.
        keyword_only_sentences: the matched keywords themselves.
        path1: output filename for the danmaku list.
        path2: output filename for the keyword list.
        choose1: whether to write ``path1``.
        choose2: whether to write ``path2``.
    """
    if choose1:
        ai_df = pd.DataFrame(ai_sentences, columns=["包含关键词的弹幕"])
        ai_df.to_excel(path1, index=False)
        # Bug fix: only announce a file once it was actually written — the
        # original printed both messages even when the flags were False.
        print(f"包含关键词的弹幕句子已保存到 {path1}")
    if choose2:
        keyword_only_df = pd.DataFrame(keyword_only_sentences, columns=["关键词"])
        keyword_only_df.to_excel(path2, index=False)
        print(f"只包含关键词的句子已保存到 {path2}")
def main_bullet():
    """Entry point: scrape BVIDs, download danmaku, analyze for AI keywords, export."""
    bvs = get_bv(BV_NUM)
    print(bvs)
    danmaku = fetch_bullet_screen(bvs)
    print(f"获取了 {len(danmaku)} 条弹幕")
    # AI-related keywords to look for inside each danmaku.
    ai_keywords = [
        "AI", "人工智能", "机器学习", "深度学习", "神经网络", "算法", "智能", "大数据", "自动化", "机器人",
        "计算机视觉", "自然语言处理", "NLP", "语音识别", "自动驾驶", "边缘计算", "强化学习",
        "生成对抗网络", "GAN", "迁移学习", "数据挖掘", "语义分析", "图像识别", "深度神经网络", "DNN",
        "决策树", "随机森林", "集成学习", "模糊逻辑", "专家系统", "计算智能", "大规模并行处理",
        "分布式系统", "物联网", "IoT", "云计算", "区块链", "量子计算", "图神经网络", "GNN",
        "人机交互", "HCI", "情感分析", "机器人过程自动化", "RPA", "无人机", "UAV", "智能城市",
        "云原生", "分布式学习", "元学习", "数字孪生", "自动化运维", "AIOps"
    ]
    sentences, hits, counts = analyze_bullet_screen_with_ai_sentences(danmaku, ai_keywords)
    print(f"提取了 {len(sentences)} 条包含AI相关关键词的弹幕")
    # Keyword-only list is skipped; sentence list and counts are written.
    save_sentences_to_excel(sentences, hits, choose2=False)
    save_keyword_counts_to_excel(counts)


if __name__ == '__main__':
    main_bullet()