You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
193 lines
6.5 KiB
193 lines
6.5 KiB
2 months ago
|
import requests
|
||
|
from bs4 import BeautifulSoup
|
||
|
import time
|
||
|
import random
|
||
|
import openpyxl
|
||
|
from collections import Counter
|
||
|
import jieba
|
||
|
import wordcloud
|
||
|
|
||
|
|
||
|
# Browser-like default request headers. Within this file they are passed to
# requests.get() by get_cid_from_bv; the other fetchers build their own.
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4083.0 Safari/537.36 Edg/82.0.458.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'en-US,en;q=0.9',
}
|
||
|
def get_video_ids(api_urls, max_count=300, max_pages=21, output_path="bv.txt", timeout=10):
    """Collect BV ids from a paged search endpoint and append them to a file.

    Pages ``1..max_pages`` are requested in order by appending the page number
    to ``api_urls``. Every result entry that carries a usable ``bvid`` is added
    to the returned list and written, one id per line, to ``output_path``.
    Collection stops as soon as ``max_count`` ids have been gathered.

    Args:
        api_urls: Endpoint URL prefix; the page number is appended to it.
        max_count: Maximum number of ids to collect.
        max_pages: Number of result pages to walk through.
        output_path: File the ids are appended to (opened in append mode, so
            ids from earlier runs are kept).
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The list of collected BV ids, in the order they were encountered.

    Raises:
        requests.RequestException: If a page request fails or times out.
        ValueError: If a response body is not valid JSON.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        # NOTE(review): hard-coded session cookie; the cookie names look like
        # Baidu ones rather than Bilibili ones - confirm it is the intended value.
        'cookie': 'BIDUPSID=B217BCFBC37D845BE7576B36283B7200; PSTM=1694936551; BAIDUID_BFESS=B217BCFBC37D845B41B0963C72424B9E:FG=1; ZFY=8a5uERFpWfnCU59:Bf7xfjug61O89yJEG1n4GF:BPaY1c:C; BDRCVFR[bPTzwF-RsLY]=mk3SLVN4HKm; H_PS_PSSID=60724_60360_60799; BD_HOME=1; BD_UPN=12314753; BA_HECTOR=058ka00hahalak0gak21a1akapkf6v1jelat81u',
    }
    video_ids = []

    # Open the output once instead of once per id; the context manager still
    # flushes everything collected so far if a later request raises.
    with open(output_path, mode='a', encoding='utf-8') as f:
        for page in range(1, max_pages + 1):
            if len(video_ids) >= max_count:
                break

            response = requests.get(api_urls + str(page), headers=request_headers, timeout=timeout)
            response.encoding = 'utf-8'
            payload = response.json()

            # A page without a result list (error payload, past the last page)
            # is skipped rather than aborting the whole run with a KeyError.
            data = payload.get('data') if isinstance(payload, dict) else None
            results = data.get('result') if isinstance(data, dict) else None

            for item in results or []:
                # Some entries carry no usable bvid; skip just those instead of
                # hiding every possible error behind a bare except.
                try:
                    bvid = item['bvid']
                except (KeyError, TypeError):
                    continue
                if not isinstance(bvid, str) or not bvid:
                    continue

                video_ids.append(bvid)
                f.write(bvid + '\n')
                if len(video_ids) >= max_count:
                    break

    return video_ids
|
||
|
|
||
|
|
||
|
# Video-search endpoint with a fixed keyword; the URL ends in "page=" because
# get_video_ids appends the page number for each request.
api_urls = 'https://api.bilibili.com/x/web-interface/search/type?search_type=video&keyword=巴黎奥运会&page='

# Executed at module level: fetch the BV ids for the search above.
video_ids = get_video_ids(api_urls)
|
||
|
# Resolve a single BV id to the CID of one of its parts.
def get_cid_from_bv(bv_number, p_number=0, timeout=10):
    """Return the CID of part ``p_number`` of the video ``bv_number``.

    The part list is requested from the player ``pagelist`` endpoint using the
    module-level ``headers``. Every failure is reported with ``print`` and
    turned into ``None`` so that a batch conversion can carry on.

    Args:
        bv_number: BV id of the video.
        p_number: Zero-based index into the returned part list.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The ``cid`` value of the selected part, or ``None`` when the request
        fails, the status code is not 200, the response holds no part list, or
        ``p_number`` is out of range.
    """
    url = 'https://api.bilibili.com/x/player/pagelist?bvid={}&jsonp=jsonp'
    try:
        response = requests.get(url.format(bv_number), headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f'Error: Failed to retrieve CID for BV code {bv_number}. Status code: {response.status_code}')
            return None

        parts = response.json()['data']

        # 'data' may not be a list; report that directly instead of letting
        # len() fail with an unrelated-looking message.
        if not isinstance(parts, list):
            print(f'Error: No part list returned for BV code {bv_number}.')
            return None

        # Reject negative indices too: they would otherwise silently select a
        # part counted from the end of the list.
        if not 0 <= p_number < len(parts):
            print(f'Error: Part number out of range for BV code {bv_number}.')
            return None

        return parts[p_number]['cid']

    except Exception as e:
        # Deliberately broad: this is the per-video boundary of a best-effort
        # batch, so any failure is reported and mapped to None.
        print(f'Error: {str(e)} for BV code {bv_number}.')
        return None
|
||
|
|
||
|
# Convert a batch of BV ids to CIDs.
def get_cids_from_bv_list(video_ids):
    """Look up the CID for every BV id in ``video_ids``, preserving order.

    The result has exactly one entry per input id; an id whose lookup failed
    is represented by ``None`` at the same position.
    """
    return [get_cid_from_bv(bv_code) for bv_code in video_ids]
|
||
|
|
||
|
# Executed at module level: resolve every collected BV id to its CID
# (entries are None where the lookup failed) and show the result.
cids = get_cids_from_bv_list(video_ids)

print(f'The corresponding CIDs for the provided BV codes are: {cids}')
|
||
|
|
||
|
# Fetch the danmaku (bullet comments) for one CID.
def get_danmu(cids, timeout=10):
    """Return the text of every danmaku attached to a single CID.

    Despite the plural parameter name, ``cids`` is one CID: it is interpolated
    as the ``oid`` query parameter of the danmaku list endpoint.

    Args:
        cids: CID of the video part, or ``None`` when the CID lookup failed.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        A list with the text of each ``<d>`` element found in the response
        body; an empty list when ``cids`` is ``None``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Failed CID lookups are passed through as None; querying "oid=None" would
    # only waste a request, so answer with "no danmaku" right away.
    if cids is None:
        return []

    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cids}'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    }
    response = requests.get(url, headers=request_headers, timeout=timeout)
    response.encoding = 'utf-8'

    soup = BeautifulSoup(response.text, 'html.parser')
    return [node.text for node in soup.find_all('d')]
|
||
|
|
||
|
# Scrape the danmaku of every CID into a single text file.
with open('all_danmu.txt', 'w', encoding='utf-8') as f:
    for video_id in cids:
        danmu_list = get_danmu(video_id)
        print(f'视频 {video_id} 的弹幕数量: {len(danmu_list)}')

        # One marker line per video, followed by its danmaku, one per line.
        section_header = f'== 开始保存视频 {video_id} 的弹幕 ==\n'
        section_body = '\n'.join(danmu_list) + '\n'
        f.write(section_header)
        f.write(section_body)

        # Pause for a random 1-3 seconds between videos to avoid being banned.
        pause_seconds = random.uniform(1, 3)
        time.sleep(pause_seconds)

print('所有视频的弹幕已保存到 all_danmu.txt 文件中')
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# 1. Load the scraped danmaku, one entry per line (newlines are kept).
filename = 'all_danmu.txt'

with open(filename, 'r', encoding='utf-8') as f:
    danmus = list(f)

# 2. Keep only the lines that mention AI (case-sensitive substring match).
ai_keywords = ['AI', '人工智能']
ai_related_danmus = []
for danmu in danmus:
    if any(keyword in danmu for keyword in ai_keywords):
        ai_related_danmus.append(danmu)

# 3. Count how often each distinct line occurs.
danmu_counter = Counter(ai_related_danmus)

# 4. Take the 8 most frequent lines.
top_danmus = danmu_counter.most_common(8)

# 5. Write the ranking to an Excel workbook.
excel_filename = 'AI相关弹幕统计.xlsx'
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'AI相关弹幕统计'

# Header row.
sheet.cell(row=1, column=1, value='弹幕内容')
sheet.cell(row=1, column=2, value='出现次数')

# Data rows start directly below the header.
for idx, (danmu, count) in enumerate(top_danmus, start=2):
    # strip() drops the trailing newline kept from the file.
    sheet.cell(row=idx, column=1, value=danmu.strip())
    sheet.cell(row=idx, column=2, value=count)

# Save the workbook.
wb.save(excel_filename)
print(f'AI相关弹幕统计已保存到 {excel_filename}')
|
||
|
|
||
|
|
||
|
# Load the whole danmaku file as one string.
filename = 'all_danmu.txt'

with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

# Segment the text with jieba and re-join the tokens with spaces.
word_list = jieba.lcut(text)
text_str = ' '.join(word_list)

# Words that are left out of the cloud.
cloud_stopwords = {'哈哈','哈哈哈','的','是','了','我','和','这','也','你','啊','吧','就是','这个','吗','他','不是','真的','都','在','现在','感觉','看','有','不'}

# Build the word cloud.
wc = wordcloud.WordCloud(
    font_path='C:/Windows/Fonts/微软雅黑/msyh.ttc',  # font file used to render Chinese text
    width=800,
    height=600,
    background_color='white',
    stopwords=cloud_stopwords,
    contour_width=1,
    contour_color='blue',
)
wc.generate(text_str)

# Save the image into the current working directory.
wc.to_file('wordcloud.png')
|
||
|
|
||
|
|