|
|
@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
from audioop import avgpp
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
import re, os
|
|
|
|
|
|
|
|
import jieba
|
|
|
|
|
|
|
|
from wordcloud import WordCloud
|
|
|
|
|
|
|
|
from imageio import imread
|
|
|
|
|
|
|
|
from collections import Counter
|
|
|
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
from PIL import Image
|
|
|
|
|
|
|
|
import jieba
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
#通过cid获取弹幕
|
|
|
|
|
|
|
|
def spider_page(cid):
    """Download the danmaku of one video part and append them to the comment file.

    Requests ``http://comment.bilibili.com/{cid}.xml``, extracts the text of
    every ``<d p="...">...</d>`` element with a regular expression, appends
    each one as its own line to the file named by the module-level
    ``comment_file_path`` and echoes it to stdout.

    Args:
        cid: Identifier interpolated into the comment XML URL.

    Returns:
        None. When the HTTP status is not 200 nothing is written and the
        status code is printed instead.
    """
    url = f'http://comment.bilibili.com/{cid}.xml'
    headers = {
        'referer': 'xxxxx',
        'User-Agent': 'xxxxx',
        'cookie': "xxxxx"
    }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding
    print(resp.text)
    if resp.status_code != 200:
        print(f"请求失败,状态码: {resp.status_code}")
        return

    # Collect the text of every danmaku element in the XML payload.
    content_list = re.findall('<d p=".*?">(.*?)</d>', resp.text)
    # Open the output once for the whole batch instead of once per comment.
    with open(comment_file_path, 'a', encoding='utf-8') as fin:
        for item in content_list:
            fin.write(item + '\n')
            print(item)
    print('-------------弹幕获取完毕!-------------')
|
|
|
|
|
|
|
|
#查找cid
|
|
|
|
|
|
|
|
def extract_cid_number(text):
    """Return the digits following the first ``cid:`` marker in *text*.

    Args:
        text: String to search.

    Returns:
        The matched run of digits as a string, or None when *text* contains
        no ``cid:<digits>`` sequence.
    """
    found = re.search(r'cid:(\d+)', text)
    # Group 1 is the digit run captured right after the "cid:" prefix.
    return found.group(1) if found else None
|
|
|
|
|
|
|
|
# Crawl up to 300 BV ids: 10 search-result pages of 30 videos each.
url = ("https://api.bilibili.com/x/web-interface/wbi/search/type")
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}
# The session cookie is a login credential and must not live in source
# control; supply it through the BILIBILI_COOKIE environment variable.
bilibili_cookie = os.environ.get('BILIBILI_COOKIE', '')
if bilibili_cookie:
    headers['cookie'] = bilibili_cookie

# Append mode: BV ids from earlier runs are kept. One id per line.
with open('bvnumbers.txt', 'a', encoding='utf-8') as file:
    for page in range(1, 11):
        # NOTE(review): 'w_rid' and 'wts' look like a request signature and
        # timestamp captured from one browser request; they are sent
        # unchanged for every page -- confirm the API still accepts them.
        params = {
            'category_id': '',
            'search_type': 'video',
            'ad_resource': 5654,
            '__refresh__': 'true',
            'context': '',
            'page': page,
            'page_size': 30,
            'pubtime_begin_s': 0,
            'pubtime_end_s': 0,
            'from_source': '',
            'from_spmid': '333.337',
            'platform': 'pc',
            'highlight': 1,
            'single_column': 0,
            'keyword': '2024巴黎奥运会',
            'qv_id': 's5p9ZoGL8W8aU7TP3gWJ1xLzPA6njovt',
            'source_tag': 3,
            'gaia_vtoken': '',
            'dynamic_offset': 24,
            'web_location': 1430654,
            'w_rid': '5475898fdbb1cc8a359f93dd5826e3f9',
            'wts': 1726221162,
        }
        # A timeout keeps one stalled request from hanging the crawl.
        response = requests.get(url, headers=headers, params=params, timeout=10)
        try:
            print(response.text)
            # Pull every BV id straight out of the raw response text.
            for i in re.finditer(r'"bvid":"(BV[\d\w]{10})"', response.text):
                bv = i.group(1)
                file.write(bv + '\n')
        finally:
            # Release the connection even if writing fails.
            response.close()
        # Pause between pages to avoid hammering the search API.
        time.sleep(2)
|
|
|
|
|
|
|
|
# Destination of every danmaku line; spider_page() reads this module-level name.
comment_file_path = 'B站弹幕.csv'
# Storage file for the cids discovered below, one per line.
output_file = 'cids.txt'
api_base_url = "https://api.bilibili.com/x/player/pagelist?bvid="
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; MS-RTC LM 8; InfoPath.2; Tablet PC 2.0)',
}
# The session cookie is a login credential and must not live in source
# control; supply it through the BILIBILI_COOKIE environment variable.
bilibili_cookie = os.environ.get('BILIBILI_COOKIE', '')
if bilibili_cookie:
    headers['cookie'] = bilibili_cookie

# NOTE(review): the search step in this file appends BV ids to
# 'bvnumbers.txt', while this step reads 'extracted_bv_numbers.txt'.
# Confirm that a separate step produces this file.
with open('extracted_bv_numbers.txt', 'r', encoding='utf-8') as file:
    # Blank lines would otherwise trigger a request with an empty bvid.
    bvnumbers = [entry.strip() for entry in file if entry.strip()]

# Mode 'w' discards the cid list left over from a previous run.
with open(output_file, 'w', encoding='utf-8') as file:
    for bvid in bvnumbers:
        api_url = f"{api_base_url}{bvid}"
        try:
            response = requests.get(api_url, headers=headers, timeout=10)
            if response.status_code == 200:
                data = response.json()
                # Only proceed when the payload has the expected list shape.
                if 'data' in data and isinstance(data['data'], list):
                    for part in data['data']:
                        cid = part.get('cid')
                        if cid is not None:
                            # Record the cid, then download its danmaku.
                            file.write(f"{cid}\n")
                            spider_page(cid)
            else:
                print(f"请求失败,状态码: {response.status_code}")
            time.sleep(1)
        except Exception as e:
            # Best-effort crawl: report the failing BV id and move on.
            print(f"处理BV号 {bvid} 时发生错误: {e}")
        # Pause before the next BV id.
        time.sleep(1)
print("所有BV号已处理完毕。")
|
|
|
|
|
|
|
|
# Keywords related to the use of AI technology at the 2024 Paris Olympics.
ai_olympic_keywords = [
    "巴黎奥运会AI", "AI技术", "智能", "子弹时间", "AI增强", "赛事转播", "辅助训练", "AI回放系统",
    "3d模型", "沉浸式虚拟重建","AI重塑","运动捕捉","特效渲染","AI修复","AI","人工智能"
]
# Maps each danmaku that contains a keyword to its number of occurrences.
danmu_counts = Counter()
output_file_path = '包含关键词的弹幕.txt'
input_file_path = 'B站弹幕.csv'

# Single pass over the danmaku file: every line containing at least one
# keyword is counted and copied to the output file (one danmaku per line).
with open(input_file_path, mode='r', encoding='utf-8') as input_file, \
        open(output_file_path, mode='w', encoding='utf-8') as output_file:
    for line in input_file:
        danmu = line.strip()  # drop the newline and surrounding whitespace
        if any(keyword in danmu for keyword in ai_olympic_keywords):
            danmu_counts[danmu] += 1
            output_file.write(danmu + '\n')

# The 8 most frequent matching danmaku, as (text, count) pairs.
top_8_danmus = danmu_counts.most_common(8)

# Save the texts of the top danmaku (without counts), one per line.
with open('top_ai_olympic_danmus.txt', 'w', encoding='utf-8') as output:
    for danmu, _ in top_8_danmus:
        output.write(f"{danmu}\n")
|
|
|
|
|
|
|
|
# Read the keyword-matching danmaku; the context manager closes the handle.
with open('包含关键词的弹幕.txt', encoding='utf-8') as f:
    text = f.read()
# Segment the Chinese text into words.
text_list = jieba.lcut(text)
print(text_list)
# Join the tokens with spaces: an empty separator would rebuild the
# unsegmented text, and WordCloud would then treat each unbroken run of
# characters as a single word instead of using the jieba tokens.
text_str = ' '.join(text_list)
print(text_str)
# Build the word cloud image.
wc = WordCloud(
    width=2000,
    height=1000,
    background_color='white',
    # Raw string: the backslashes in the Windows path are not escape sequences.
    font_path=r'C:\Windows\Fonts\SIMLI.TTF',
)
wc.generate(text_str)
wc.to_file('ciyun.png')
|
|
|
|
|
|
|
|
# Report each of the most frequent matching danmaku with its count.
print("出现次数最多的前8条关于2024巴黎奥运会赛事应用AI技术的弹幕:")
for entry in top_8_danmus:
    # Each entry is a (text, occurrences) pair.
    danmu, count = entry
    print(f"'{danmu}' 出现了 {count} 次")
|