# Crawl Bilibili danmaku (bullet-screen comments) from Paris Olympics videos,
# tally the AI-related ones into an Excel sheet, and draw a word cloud.
import requests
from bs4 import BeautifulSoup
import time
import random
import openpyxl
from collections import Counter
import jieba
import wordcloud
# Browser-like request headers shared by most of the requests below.
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4083.0 Safari/537.36 Edg/82.0.458.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'en-US,en;q=0.9'
}
def get_video_ids(api_urls):
    video_ids = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        # Cookie taken from a browser session; replace it with your own if the search API rejects requests.
        'cookie': 'BIDUPSID=B217BCFBC37D845BE7576B36283B7200; PSTM=1694936551; BAIDUID_BFESS=B217BCFBC37D845B41B0963C72424B9E:FG=1; ZFY=8a5uERFpWfnCU59:Bf7xfjug61O89yJEG1n4GF:BPaY1c:C; BDRCVFR[bPTzwF-RsLY]=mk3SLVN4HKm; H_PS_PSSID=60724_60360_60799; BD_HOME=1; BD_UPN=12314753; BA_HECTOR=058ka00hahalak0gak21a1akapkf6v1jelat81u'
    }
    # Fetch the bvids of the first 300 hot videos and save them to bv.txt.
    # Counter for how many ids have been collected so far.
    cnt = 0
    # Pull video entries from several pages of search results.
    for page in range(1, 22):
        # Turn the page by appending the page number to the base URL.
        api_url = api_urls + str(page)
        response = requests.get(api_url, headers=headers)
        response.encoding = 'utf-8'
        payload = response.json()
        results = payload['data']['result']
        for item in results:
            # Use try/except to skip entries that come back without a bvid.
            try:
                bvid = item['bvid']
                video_ids.append(bvid)
                with open('bv.txt', mode='a', encoding='utf-8') as f:  # append the bvid to bv.txt
                    f.write(bvid + '\n')
                cnt += 1
                if cnt >= 300:
                    break
            except Exception:
                continue
        if cnt >= 300:
            break
    return video_ids
# Base search API URL for the keyword 巴黎奥运会 (Paris Olympics); the page number is appended per request.
api_urls = 'https://api.bilibili.com/x/web-interface/search/type?search_type=video&keyword=巴黎奥运会&page='
video_ids = get_video_ids(api_urls)
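# Optional hardening (an addition, not part of the original script): search
# pages can occasionally repeat entries, so deduplicating while preserving
# order keeps the later per-video work from fetching the same danmaku twice.
video_ids = list(dict.fromkeys(video_ids))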
# Resolve a single bvid to the cid of one of its parts (p_number is zero-based).
def get_cid_from_bv(bv_number, p_number=0):
    try:
        url = 'https://api.bilibili.com/x/player/pagelist?bvid={}&jsonp=jsonp'
        response = requests.get(url.format(bv_number), headers=headers)
        if response.status_code == 200:
            data = response.json()['data']
            if p_number < len(data):
                return data[p_number]['cid']
            else:
                print(f'Error: Part number out of range for BV code {bv_number}.')
                return None
        else:
            print(f'Error: Failed to retrieve CID for BV code {bv_number}. Status code: {response.status_code}')
            return None
    except Exception as e:
        print(f'Error: {str(e)} for BV code {bv_number}.')
        return None
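# Illustrative usage (hypothetical bvid and cid, shown only as comments):
#     get_cid_from_bv('BV1xx411c7mD')     # -> cid of part 1, e.g. 123456
#     get_cid_from_bv('BV1xx411c7mD', 2)  # -> cid of part 3, or None if out of range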
# Convert a list of bvids to cids in bulk.
def get_cids_from_bv_list(video_ids):
    cid_list = []
    for bv_code in video_ids:
        # get_cid_from_bv already returns None on failure, so append its result directly.
        cid_list.append(get_cid_from_bv(bv_code))
    return cid_list
cids = get_cids_from_bv_list(video_ids)
print(f'The corresponding CIDs for the provided BV codes are: {cids}')
# Fetch all danmaku for a single cid from the list.so endpoint.
def get_danmu(cid):
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    danmu_list = []
    for danmu in soup.find_all('d'):
        danmu_list.append(danmu.text)
    return danmu_list
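# For reference, the list.so endpoint returns XML in which each danmaku is a
# <d> element, roughly <d p="time,mode,fontsize,color,...">text</d>, so
# soup.find_all('d') yields one node per danmaku and .text is its body.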
# Crawl danmaku for every video and write them all to one file.
with open('all_danmu.txt', 'w', encoding='utf-8') as f:
    for video_id in cids:
        # Skip videos whose cid could not be resolved.
        if video_id is None:
            continue
        danmu_list = get_danmu(video_id)
        print(f'Video {video_id}: {len(danmu_list)} danmaku')
        # Write this video's danmaku to the file behind a marker line.
        f.write(f'== 开始保存视频 {video_id} 的弹幕 ==\n')
        f.write('\n'.join(danmu_list) + '\n')
        # Sleep 1-3 seconds at random to avoid getting banned.
        time.sleep(random.uniform(1, 3))
print('All danmaku saved to all_danmu.txt')
# 1. Read the danmaku data file.
filename = 'all_danmu.txt'
with open(filename, 'r', encoding='utf-8') as f:
    danmus = f.readlines()
# 2. Keep only the danmaku related to AI technology.
ai_keywords = ['AI', '人工智能']
ai_related_danmus = [danmu for danmu in danmus if any(keyword in danmu for keyword in ai_keywords)]
# 3. Count how many times each danmaku appears.
danmu_counter = Counter(ai_related_danmus)
# 4. Take the 8 most frequent danmaku.
top_danmus = danmu_counter.most_common(8)
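# top_danmus is a list of (text, count) pairs in descending order of count,
# e.g. [('AI修复太强了\n', 12), ('人工智能yyds\n', 9), ...] (values illustrative).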
# 5. Write the statistics to an Excel workbook.
excel_filename = 'AI相关弹幕统计.xlsx'
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'AI相关弹幕统计'
sheet['A1'] = '弹幕内容'  # danmaku text
sheet['B1'] = '出现次数'  # occurrence count
# Write the top-8 danmaku and their counts.
for idx, (danmu, count) in enumerate(top_danmus, start=2):
    sheet[f'A{idx}'] = danmu.strip()  # drop the trailing newline
    sheet[f'B{idx}'] = count
# Save the Excel file.
wb.save(excel_filename)
print(f'AI-related danmaku statistics saved to {excel_filename}')
# Read the danmaku data file again for the word cloud.
filename = 'all_danmu.txt'
with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()
# Tokenize with jieba.
word_list = jieba.lcut(text)
text_str = ' '.join(word_list)
# Build the word cloud; a CJK-capable font is required or Chinese renders as boxes.
wc = wordcloud.WordCloud(font_path='C:/Windows/Fonts/msyh.ttc',  # Microsoft YaHei (微软雅黑)
                         width=800, height=600,                  # canvas size
                         background_color='white',
                         # Filter common filler words out of the cloud.
                         stopwords={'哈哈', '哈哈哈', '就是', '这个', '不是', '真的', '现在', '感觉'},
                         contour_width=1, contour_color='blue')
wc.generate(text_str)
# Save the word cloud image to the current folder.
wc.to_file('wordcloud.png')
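# Optional preview (a sketch assuming matplotlib is installed; not part of the
# original pipeline):
# import matplotlib.pyplot as plt
# plt.imshow(wc, interpolation='bilinear')
# plt.axis('off')
# plt.show()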