|
|
import requests
|
|
|
import re
|
|
|
import warnings
|
|
|
import json
|
|
|
import jieba
|
|
|
import numpy as np
|
|
|
from wordcloud import WordCloud, ImageColorGenerator
|
|
|
import matplotlib.pyplot as plt
|
|
|
from PIL import Image
|
|
|
import pandas as pd
|
|
|
from collections import Counter
|
|
|
|
|
|
# Running counter of processed videos; main() prints it and increments it
# once per video via `global cnt`.
cnt = 1

# HTTP headers sent with every requests.get() call in this file.
#
# SECURITY NOTE(review): the cookie string below embeds what look like
# logged-in session credentials (SESSDATA, bili_jct, DedeUserID,
# bili_ticket). Hard-coding them in source exposes the account to anyone who
# can read the file. They should presumably be revoked and supplied from
# outside the source tree (environment variable or config file) — confirm
# with the owner before changing, since the requests may depend on them.
headers = {
    # The cookie is one string assembled from adjacent string literals.
    'cookie': 'b_nut=1659613422; buvid3=6C07DC9F-EE29-7F28-2B63-1BF4ECD504A422941infoc; '
    'CURRENT_FNVAL=4048; header_theme_version=CLOSE; '
    'buvid4=92532619-00E5-BF92-443B-595CD15DE59481123-023013113-97xIUW%2FWJtRnoJI8Rbvu4Q%3D%3D;'
    ' enable_web_push=DISABLE; rpdid=|(u))kkYu|J|0J\'u~u|)u)RR); '
    'hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; '
    'LIVE_BUVID=AUTO2617189721183630; PVID=1; buvid_fp_plain=undefined; '
    'CURRENT_QUALITY=80; _uuid=8108A2C6D-A7AD-7F210-B10E5-EA35A5B47DA391233infoc; '
    'home_feed_column=5; browser_resolution=1545-857; '
    'bsource=search_bing; fingerprint=0c7279b7c69b9542a76b8d9df9b7872a; '
    'buvid_fp=0c7279b7c69b9542a76b8d9df9b7872a; '
    'bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU0NTE2MTEsImlhdCI6MTcyNTE5MjM1MSwicGx0IjotMX0.9HAkh-aLUFL3i2asyrGNSGwvZnlCdO1qHnr8KCPYRAY; '
    'bili_ticket_expires=1725451551; b_lsid=B7B10E6101_191B8F11FA5; bp_t_offset_1760559884=973015460700225536;'
    ' SESSDATA=96c7142d%2C1740938493%2C3a910%2A92CjCc4yaZOS0NpMlzpaXXFlyvjHEGHEZxVtH8JQp1M7im9KrgmNTYIP2F2prPQh4WI4gSVjJtTUt1dGVjMk9SMk9HNkl5MXRWV0tISnNlYzJndGhFVFR1SHVVLWt4UTJjLS1VQ0h1THFmcUY2UU5BV1Jsa2VjTGxDYnpFcnppLVNBQkp3VXdjYzVnIIEC; '
    'bili_jct=3a65db4d1ef7bc981b1673000e0bc73c; DedeUserID=1760559884;'
    ' DedeUserID__ckMd5=b5c900381ecb7bcd; sid=ojanxj62',
    # Browser identification string sent alongside the cookie.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}
|
|
|
# 获取弹幕地址
|
|
|
|
|
|
|
|
|
def get_danmu_url(video_str):
    """Return the danmaku (bullet-comment) XML URL for a video page.

    Downloads the page at ``video_str`` with the module-level ``headers``,
    takes the first ``"cid":<value>,`` occurrence in the returned HTML and
    builds ``https://comment.bilibili.com/<cid>.xml`` from it.

    Args:
        video_str: URL of the video page to fetch.

    Returns:
        The danmaku XML URL as a string.

    Raises:
        ValueError: If the page contains no ``"cid":...,`` field (previously
            this surfaced as an opaque ``AttributeError`` on ``None``).
    """
    response = requests.get(url=video_str, headers=headers)

    # re.search returns None when nothing matches, so check before using it.
    match = re.search(r'"cid":(.*?),', response.text)
    if match is None:
        raise ValueError(f'no "cid" field found in page: {video_str}')

    return f'https://comment.bilibili.com/{match.group(1)}.xml'
|
|
|
|
|
|
# 获取视频地址
|
|
|
|
|
|
|
|
|
def get_vedio(bv):
    """Build the bilibili video page URL for the BV id ``bv``.

    Args:
        bv: The video's BV id as a string.

    Returns:
        ``"https://www.bilibili.com/video/"`` followed by ``bv``.
    """
    base_url = "https://www.bilibili.com/video/"
    return base_url + bv
|
|
|
# 获取bv号
|
|
|
|
|
|
|
|
|
def get_bvid(url, pos):
    """Return the ``bvid`` of the entry at index ``pos`` in a search response.

    Intended for the search API
    "https://api.bilibili.com/x/web-interface/search/all/v2?page=1-15&keyword="
    to collect the bvids of the first 300 videos. The page at ``url`` is
    fetched with the module-level ``headers`` on every call.

    Args:
        url: Full search API URL, including page and keyword.
        pos: Index of the wanted entry within the result group.

    Returns:
        The ``bvid`` value of the selected entry.
    """
    body = requests.get(url=url, headers=headers).text
    payload = json.loads(body)

    # Element 11 of data.result is the group whose "data" list is indexed by
    # pos — presumably the video results; TODO confirm against the API.
    result_group = payload["data"]["result"][11]
    return result_group["data"][pos]["bvid"]
|
|
|
|
|
|
# 统计弹幕次数
|
|
|
|
|
|
|
|
|
def count_danmu():
    """Count AI-related danmaku in ``弹幕.txt`` and export the statistics.

    Reads one danmaku per line, tallies identical lines, keeps only those
    containing ``AI`` or ``人工智能``, prints the 8 most frequent ones, and
    writes every kept danmaku with its count to ``AI技术应用弹幕统计.xlsx``.
    """
    source_path = '弹幕.txt'

    # Tally every line (surrounding whitespace removed) as it is read.
    with open(source_path, 'r', encoding='utf-8') as handle:
        all_counts = Counter(row.strip() for row in handle)

    # Keep only the danmaku that mention AI, in first-seen order.
    ai_counts = Counter()
    for text, times in all_counts.items():
        if 'AI' in text or '人工智能' in text:
            ai_counts[text] = times

    # Report the eight most frequent AI-related danmaku, ranked from 1.
    rank = 0
    for text, times in ai_counts.most_common(8):
        rank += 1
        print(f'排名 #{rank}: 弹幕 "{text}" 出现次数:{times}')

    # Export all AI-related danmaku (not just the top 8) to Excel.
    rows = list(ai_counts.items())
    table = pd.DataFrame(rows, columns=['弹幕', '次数'])
    table.to_excel('AI技术应用弹幕统计.xlsx', index=False)
|
|
|
|
|
|
# 生成云图
|
|
|
|
|
|
|
|
|
def make_graph():
    """Render and display a shaped word cloud built from ``AI_danmu.txt``.

    The text is segmented with jieba, laid out by WordCloud using ``img.png``
    as the shape mask, recoloured from that same image, and shown in a
    matplotlib window.
    """
    # NOTE(review): nothing in the visible part of this file writes
    # AI_danmu.txt — confirm it is produced elsewhere before running.
    with open('AI_danmu.txt', 'r', encoding='utf-8') as src:
        text_data = ''.join(row.strip() + ' ' for row in src)

    # Segment the Chinese text and rejoin the tokens with spaces, the form
    # passed to WordCloud.generate below.
    segmented = " ".join(jieba.cut(text_data, cut_all=False))

    # The custom shape image, as an array, is used as mask and colour source.
    shape_mask = np.array(Image.open('img.png'))

    cloud_options = dict(
        width=2000,
        background_color='white',
        mask=shape_mask,           # custom shape
        contour_width=1,
        contour_color='white',     # outline colour
        font_path='STKAITI.TTF',   # font file used to display Chinese
        max_words=30000,           # upper bound on displayed words
        colormap='Blues',          # colour map; change as needed
    )
    cloud = WordCloud(**cloud_options).generate(segmented)

    # Recolour the words using the colours of the shape image.
    cloud.recolor(color_func=ImageColorGenerator(shape_mask))

    # Show the word cloud with the axes hidden.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('')
    plt.show()
|
|
|
|
|
|
|
|
|
def main():
    """Scrape danmaku for the search results, then analyse and plot them.

    For each of the 15 search result pages (20 videos per page) for the
    keyword "2024巴黎奥运会", resolves every video's danmaku XML, extracts the
    danmaku texts and appends them to ``弹幕.txt`` one per line. Afterwards
    runs ``count_danmu()`` and ``make_graph()``.

    Uses and increments the module-level ``cnt`` as a progress counter.
    """
    global cnt

    # Loop-invariant setup: silence warnings once and compile the danmaku
    # pattern once instead of repeating both for every video.
    warnings.filterwarnings("ignore")
    danmu_pattern = re.compile(r'<d p=".*?">(.*?)</d>')

    # The search API is queried for pages 1-15 (first 300 videos). The former
    # range(15) requested page 0 and never reached page 15.
    for page in range(1, 16):
        url = f'https://api.bilibili.com/x/web-interface/search/all/v2?page={page}&keyword=2024巴黎奥运会'

        for j in range(20):
            print(cnt)
            cnt += 1

            # NOTE(review): get_bvid downloads the search page on every call,
            # so each page is fetched 20 times; consider fetching it once.
            bv = get_bvid(url, j)
            vedio_url_data = get_vedio(bv)
            danmu_url = get_danmu_url(vedio_url_data)

            response = requests.get(url=danmu_url, headers=headers)
            # Decode with the detected encoding rather than the header default.
            response.encoding = response.apparent_encoding
            datalist = danmu_pattern.findall(response.text)

            # The context manager closes the file even if a write fails.
            with open('弹幕.txt', mode='a', encoding='utf-8') as f:
                f.writelines(text + '\n' for text in datalist)

    count_danmu()
    make_graph()
|
|
|
|
|
|
|
|
|
# Run the full scrape / count / plot pipeline only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == '__main__':
    main()