You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

154 lines
7.2 KiB

import requests
import json
import re
import jieba
import time
import wordcloud
import numpy as np
from PIL import Image
import pandas as pd
def get_search(v_keyword, max_videos=300):
    """Crawl Bilibili video search results and save every video's danmaku.

    For each video returned by the search API (at most ``max_videos``), the
    BV id is appended to ``bv_id_1.txt``. The danmaku text of every part
    (cid) of that video is appended, one comment per line, to
    ``弹幕文本.txt``. Both files are opened in append mode, so repeated runs
    accumulate.

    Args:
        v_keyword: Keyword sent to the Bilibili search API.
        max_videos: Upper bound on the number of videos to process.
            Values <= 0 make the function return without any request.

    Returns:
        None. Output goes to the two text files; progress and failures are
        reported with ``print``.
    """
    # NOTE(review): the indentation of the original was lost. This version
    # assumes the counter and the one-second delay apply once per video
    # (after all of its parts were handled) -- confirm against the author's
    # intent.
    search_url = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
    request_timeout = 10  # seconds; without a timeout a stalled socket blocks forever
    page_size = 30  # videos requested per search page

    # NOTE(security): a complete session cookie (SESSDATA / bili_jct) is
    # hard-coded here. It is kept verbatim so behavior does not change, but
    # it should be rotated and loaded from configuration instead.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Referer': 'https://www.bilibili.com/video/BV1xxxxxxx',  # placeholder; change to the target video's URL
        'Accept': 'application/json, text/plain, */*',
        'cookie':'buvid3 = 2C1B627D - C611 - D842 - 783F - 1D6D6205BE8E30149infoc;b_nut = 1716183429;buvid4 = 58C6836B - CD3C - B8D8 - B75E - 014E5D19E71730149 - 024052005 - BX9TPJ1Qd1SMwWPyECcJSQ % 3D % 3D;_uuid = 88852C4D - 2847 - F2A7 - 1043A - FCF7104D7E6DD13761infoc;rpdid = | (JlklRl)~Yu0Ju~uYu)Jlu~; DedeUserID=22863112; DedeUserID__ckMd5=f744f10bf9a83bc2; enable_web_push=DISABLE; header_theme_version=CLOSE; hit-dyn-v2=1; buvid_fp_plain=undefined; LIVE_BUVID=AUTO1617162131622559; CURRENT_FNVAL=4048; fingerprint=3b1cd80e32b9900fd22fdd2e95f8b55f; home_feed_column=5; CURRENT_QUALITY=80; browser_resolution=1488-738; PVID=1; buvid_fp=3b1cd80e32b9900fd22fdd2e95f8b55f; SESSDATA=234b58b7%2C1741842619%2C4f93a%2A91CjAqN6Nk1W9q-NW-cK5kX_jJcu9jtvJEQe4MTO1eHvrS8hz-apd6jYPG1Lm9-_NlfGQSVkZnM1FNa2hGYzl5a1pCbkkzZ2JpQlc1RFRYUTY2N3VlU0Q5ZmVTZ01zLUE3aXVPY0VzcXpGeVNDMVU5dUNsNlYxbHpRYXZab3h0amEyQ011ZFBCdTl3IIEC; bili_jct=4e3c35728fb766db9f42f2eded6a02c2; b_lsid=86E10C228_191F3A74756; bp_t_offset_22863112=977239569625776128'
    }
    # Compiled once; captures the text between <d p="..."> and </d>.
    danmaku_pattern = re.compile(r'<d p=".*?">(.*?)</d>')

    video_count = 0  # number of videos processed so far
    # Round up: floor division would request zero pages for max_videos < 30
    # and drop the trailing partial page for non-multiples of page_size.
    max_page = (max_videos + page_size - 1) // page_size

    for page in range(1, max_page + 1):
        if video_count >= max_videos:
            break

        params = {
            '__refresh__': 'true',
            'page': page,
            'page_size': page_size,
            'keyword': v_keyword,
            'order': 'totalrank',
            'search_type': 'video',
            'csrf': 'your_bili_jct_value'  # placeholder; must be the valid bili_jct value from the cookie
        }
        try:
            r = requests.get(search_url, headers=headers, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"请求失败:{exc}")
            break
        if r.status_code != 200:
            print(f"请求失败,状态码:{r.status_code}")
            break

        try:
            payload = r.json()
        except ValueError:
            # Body was not JSON; treat it as a page without results.
            payload = {}
        # `or` also covers a JSON null "data"/"result", where chaining
        # .get() on the default value would fail on None.
        result = (payload.get('data') or {}).get('result') or []
        if not result:
            print(f"{page}页没有结果,停止爬取。")
            break

        # Handle every search hit on the current page.
        for index in result:
            if video_count >= max_videos:
                break

            bv_id = index.get('bvid')
            if not bv_id:
                # Entry without a BV id cannot be resolved to a video; skip it.
                continue
            print(f"正在处理视频: {bv_id}")
            with open('bv_id_1.txt', mode='a', encoding='utf-8') as f:
                f.write(f"{bv_id}\n")

            # Resolve the video's parts; each part has its own cid.
            cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp'
            try:
                response = requests.get(url=cid_url, headers=headers, timeout=request_timeout)
            except requests.RequestException as exc:
                print(f"获取 cid 失败:{exc}")
                continue
            if response.status_code != 200:
                print(f"获取 cid 失败,状态码:{response.status_code}")
                continue
            try:
                res_dict = json.loads(response.content.decode('utf-8'))
            except ValueError:
                # Covers both invalid UTF-8 and invalid JSON.
                res_dict = {}
            values = res_dict.get('data') or []
            if not values:
                print(f"没有找到视频 {bv_id} 的 cid。")
                continue

            for cid_values in values:
                cid = cid_values.get('cid')
                if cid is None:
                    continue

                # Download the danmaku XML for this part.
                danmu_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
                try:
                    response = requests.get(danmu_url, headers=headers, timeout=request_timeout)
                except requests.RequestException as exc:
                    print(f"获取弹幕失败:{exc}")
                    continue
                if response.status_code != 200:
                    print(f"获取弹幕失败,状态码:{response.status_code}")
                    continue

                # A stray invalid byte should not abort the whole crawl.
                xml_text = response.content.decode('utf-8', errors='replace')
                content_list = danmaku_pattern.findall(xml_text)
                if not content_list:
                    print(f"视频 {bv_id} 没有弹幕。")
                    continue
                print(f"视频 {bv_id} 的弹幕数量: {len(content_list)}")

                # Append the danmaku text, one comment per line.
                with open('弹幕文本.txt', mode='a', encoding='utf-8') as f:
                    f.writelines(f"{content}\n" for content in content_list)

            video_count += 1
            print(f"当前已爬取视频数量: {video_count}")
            time.sleep(1)  # throttle requests to lower the risk of being blocked
def get_wordcloud():
    """Render the collected danmaku text as a word-cloud image.

    Reads ``弹幕文本.txt``, segments it with jieba, and writes the cloud
    (shaped by the mask image ``7.png``, drawn with the ``msyh.ttc`` font)
    to ``词云图.png``. When the text file contains only whitespace an error
    is printed and nothing is generated.
    """
    with open('弹幕文本.txt', encoding='utf-8') as source:
        danmaku_text = source.read().strip()

    if not danmaku_text:
        print("错误:弹幕文本为空,无法生成词云。")
        return

    # Laughter fillers and a few pronouns/particles excluded from the cloud.
    # NOTE(review): the original listed the empty string many times (and one
    # filler twice); those entries may be characters lost in transit --
    # confirm. As a set, the duplicates collapse to the same value.
    filler_words = {
        "哈哈", "哈哈哈", "哈哈哈哈", "哈哈哈哈哈", "哈哈哈哈哈哈", "哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈",
        "哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈",
        "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈",
        '',
        '这个', '不是', '真的', '我们', '你们', '他们',
    }

    # WordCloud splits on whitespace, so join the jieba tokens with spaces.
    segmented = ' '.join(jieba.cut(danmaku_text))

    # The picture is used as the mask that shapes the cloud.
    shape = np.array(Image.open("7.png"))
    cloud_options = {
        'background_color': 'white',
        'font_path': 'msyh.ttc',
        'colormap': 'Blues',
        'mask': shape,
        'stopwords': filler_words,
    }
    cloud = wordcloud.WordCloud(**cloud_options)
    cloud.generate(segmented)
    cloud.to_file('词云图.png')
    print("词云图生成成功!")
def get_excel():
    """Tally identical danmaku lines and export the counts to Excel.

    Reads ``弹幕文本.txt``, strips surrounding whitespace from every line,
    counts how often each distinct line occurs, and writes the two-column
    table (``弹幕``, ``出现次数``) to ``结果统计.xlsx`` without the index.
    """
    with open('弹幕文本.txt', 'r', encoding='utf-8') as source:
        cleaned = [raw.strip() for raw in source]

    # value_counts() orders the distinct lines by descending frequency.
    tally = pd.Series(cleaned).value_counts()
    table = tally.rename_axis('弹幕').reset_index(name='出现次数')
    table.to_excel('结果统计.xlsx', index=False)
def analyze_ai_application():
    """Count AI-related keywords in the danmaku text and export to Excel.

    Every keyword is counted as a case-sensitive, non-overlapping substring
    of the whole ``弹幕文本.txt`` content (``str.count``). The result, sorted
    by descending count with ties kept in keyword-list order, is written to
    ``关键词统计.xlsx`` without the index.
    """
    ai_keywords = ["智能装备", "数据分析", "虚拟训练", "人工智能", "AR", "3D分析", "自动裁判", "ai","科学","自动","芯片","算法","智能","建模"]

    with open('弹幕文本.txt', encoding='utf8') as source:
        corpus = source.read()

    rows = [(keyword, corpus.count(keyword)) for keyword in ai_keywords]
    # list.sort is stable, so equal counts keep the keyword-list order.
    rows.sort(key=lambda row: row[1], reverse=True)

    report = pd.DataFrame(rows, columns=['关键词', '出现次数'])
    report.to_excel('关键词统计.xlsx', index=False)
if __name__ == '__main__':
    # Crawl first: the three reports below all read the danmaku text file
    # that the crawl appends to.
    get_search('2024巴黎奥运会', max_videos=300)
    for build_report in (get_wordcloud, get_excel, analyze_ai_application):
        build_report()