You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

141 lines
6.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
import warnings
import json
import jieba
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
from collections import Counter
# Request headers (cookie + user-agent) passed to every requests.get call in
# this script.
# NOTE(review): the 'cookie' value embeds what look like live session
# credentials (SESSDATA, bili_jct, DedeUserID). Keeping them in source exposes
# the account to anyone who can read this file - consider loading the cookie
# from an environment variable or an untracked config file and rotating the
# exposed session.
headers = {
'cookie': 'b_nut=1659613422; buvid3=6C07DC9F-EE29-7F28-2B63-1BF4ECD504A422941infoc; '
'CURRENT_FNVAL=4048; header_theme_version=CLOSE; '
'buvid4=92532619-00E5-BF92-443B-595CD15DE59481123-023013113-97xIUW%2FWJtRnoJI8Rbvu4Q%3D%3D;'
' enable_web_push=DISABLE; rpdid=|(u))kkYu|J|0J\'u~u|)u)RR); '
'hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; '
'LIVE_BUVID=AUTO2617189721183630; PVID=1; buvid_fp_plain=undefined; '
'CURRENT_QUALITY=80; _uuid=8108A2C6D-A7AD-7F210-B10E5-EA35A5B47DA391233infoc; '
'home_feed_column=5; browser_resolution=1545-857; '
'bsource=search_bing; fingerprint=0c7279b7c69b9542a76b8d9df9b7872a; '
'buvid_fp=0c7279b7c69b9542a76b8d9df9b7872a; '
'bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU0NTE2MTEsImlhdCI6MTcyNTE5MjM1MSwicGx0IjotMX0.9HAkh-aLUFL3i2asyrGNSGwvZnlCdO1qHnr8KCPYRAY; '
'bili_ticket_expires=1725451551; b_lsid=B7B10E6101_191B8F11FA5; bp_t_offset_1760559884=973015460700225536;'
' SESSDATA=96c7142d%2C1740938493%2C3a910%2A92CjCc4yaZOS0NpMlzpaXXFlyvjHEGHEZxVtH8JQp1M7im9KrgmNTYIP2F2prPQh4WI4gSVjJtTUt1dGVjMk9SMk9HNkl5MXRWV0tISnNlYzJndGhFVFR1SHVVLWt4UTJjLS1VQ0h1THFmcUY2UU5BV1Jsa2VjTGxDYnpFcnppLVNBQkp3VXdjYzVnIIEC; '
'bili_jct=3a65db4d1ef7bc981b1673000e0bc73c; DedeUserID=1760559884;'
' DedeUserID__ckMd5=b5c900381ecb7bcd; sid=ojanxj62',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}
# Running counter of processed videos; main() prints it and then increments it.
cnt = 1
def GetDanMuUrl(video_str):
    """Resolve the danmaku (bullet-comment) XML URL for a video page.

    Downloads the page at ``video_str``, takes the first ``"cid":<value>,``
    occurrence in the returned HTML and builds the comment-XML address
    from that value.

    Args:
        video_str: URL of the video page to inspect.

    Returns:
        The string ``https://comment.bilibili.com/<cid>.xml``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no ``"cid"`` field (previously this
            surfaced as an opaque ``AttributeError`` on ``None``).
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url=video_str, headers=headers, timeout=10)
    response.raise_for_status()

    match = re.search(r'"cid":(.*?),', response.text)
    if match is None:
        raise ValueError(f'no "cid" field found in page {video_str!r}')

    cid = match.group(1)
    return f'https://comment.bilibili.com/{cid}.xml'
def GetBvid(url, pos, result_index=11):
    """Return the BV id of one video from a search-API response.

    The script calls this with URLs of the form
    ``https://api.bilibili.com/x/web-interface/search/all/v2?page=<1-15>&keyword=<kw>``
    to collect the bvids of the first 300 videos.

    Args:
        url: Search-API URL to request.
        pos: Index of the wanted entry inside the selected result section.
        result_index: Index into ``data.result`` of the section to read.
            Defaults to 11, the value this script has always used.
            NOTE(review): presumably the "video" section of the response -
            verify against a live payload.

    Returns:
        The ``bvid`` string of the selected entry.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        json.JSONDecodeError: If the body is not valid JSON.
        KeyError, IndexError: If the payload lacks the expected structure or
            ``pos`` / ``result_index`` is out of range.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    payload = json.loads(response.text)
    return payload["data"]["result"][result_index]["data"][pos]["bvid"]
def GetVedio(bv):
    """Build the video page URL for the given BV id.

    Args:
        bv: BV id string; it is appended to the video base URL unchanged.

    Returns:
        ``"https://www.bilibili.com/video/"`` followed by ``bv``.
    """
    base = "https://www.bilibili.com/video/"
    return base + bv
def CountDanmu():
    """Tally AI-related danmaku and report them.

    Reads '弹幕.txt' (one danmaku per line, UTF-8), counts identical lines,
    keeps only those containing 'AI' or '人工智能', prints the eight most
    frequent ones, and writes every kept danmaku with its count to
    'AI技术应用弹幕统计.xlsx'.
    """
    # One stripped line == one danmaku; Counter tallies identical texts.
    with open('弹幕.txt', 'r', encoding='utf-8') as source:
        occurrences = Counter(raw.strip() for raw in source)

    # Restrict the tally to danmaku that mention AI.
    ai_counts = Counter({
        text: times
        for text, times in occurrences.items()
        if 'AI' in text or '人工智能' in text
    })

    # Report the eight most frequent AI-related danmaku, ranked from 1.
    for rank, (text, times) in enumerate(ai_counts.most_common(8), 1):
        print(f'排名 #{rank}: 弹幕 "{text}" 出现次数:{times}')

    # Export the full filtered tally (not only the top eight) to Excel.
    table = pd.DataFrame(list(ai_counts.items()), columns=['弹幕', '次数'])
    table.to_excel('AI技术应用弹幕统计.xlsx', index=False)
def make_graph(text_path='AI_danmu.txt', mask_path=None,
               output_path='wordcloud.png'):
    """Generate a word cloud from a text file of danmaku.

    The text is segmented with jieba and rendered with WordCloud. This
    function previously referenced an undefined global ``shape_mask`` (a
    ``NameError`` at call time) and discarded the generated cloud; the mask
    is now optional and the result is saved and returned.

    Args:
        text_path: UTF-8 text file to read, one danmaku per line.
            NOTE(review): nothing else in this script writes
            'AI_danmu.txt' - confirm where that file comes from.
        mask_path: Optional path of an image used both as the cloud's shape
            mask and as the colour source for recolouring. When ``None`` the
            cloud is rendered without a mask and keeps the 'Blues' colormap.
        output_path: Where to save the rendered image; pass ``None`` to skip
            saving.

    Returns:
        The generated ``WordCloud`` object.
    """
    with open(text_path, 'r', encoding='utf-8') as file:
        text_data = ' '.join(line.strip() for line in file)

    # jieba precise mode; the tokens are re-joined with spaces for WordCloud.
    word_list = ' '.join(jieba.cut(text_data, cut_all=False))

    shape_mask = None
    if mask_path is not None:
        # Force RGB so greyscale/palette images also work as a colour source.
        shape_mask = np.array(Image.open(mask_path).convert('RGB'))

    wordcloud = WordCloud(
        width=2000,
        background_color='white',
        mask=shape_mask,          # custom shape, if one was supplied
        contour_width=1,
        contour_color='white',    # outline colour
        font_path='STKAITI.TTF',  # font file used to render Chinese text
        max_words=30000,          # upper bound on rendered words
        colormap='Blues',
    ).generate(word_list)

    if shape_mask is not None:
        # Recolour the words using the colours of the mask image.
        wordcloud.recolor(color_func=ImageColorGenerator(shape_mask))

    if output_path is not None:
        wordcloud.to_file(output_path)
    return wordcloud
def main():
    """Crawl danmaku for the search results, then run the analysis.

    For each of 15 search-result pages and each of 20 positions per page,
    resolve the video's BV id, page URL and danmaku XML URL, download the
    XML and append every danmaku text to '弹幕.txt'. Afterwards suppress
    warnings and run CountDanmu() and make_graph().

    The module-level counter ``cnt`` is printed and incremented once per
    video as a progress indicator.

    Raises:
        Whatever GetBvid / GetDanMuUrl / requests raise; a failure on any
        video aborts the crawl.
    """
    global cnt
    # Captures the text content of every <d p="...">...</d> element.
    danmu_pattern = re.compile(r'<d p=".*?">(.*?)</d>')

    # Pages are requested as 1..15 (the loop used to start at page 0).
    for page in range(1, 16):
        url = f'https://api.bilibili.com/x/web-interface/search/all/v2?page={page}&keyword=2024巴黎奥运会'
        # NOTE(review): assumes each result page holds 20 videos - confirm.
        for j in range(20):
            print(cnt)
            cnt += 1

            # The original called unbound local names here; these are the
            # helpers that produce each value.
            bv = GetBvid(url, j)
            vedio_url = GetVedio(bv)
            danmu_url = GetDanMuUrl(vedio_url)

            # A timeout keeps one stalled download from hanging the crawl.
            response = requests.get(url=danmu_url, headers=headers, timeout=10)
            response.encoding = response.apparent_encoding
            datalist = danmu_pattern.findall(response.text)

            # Append mode: danmaku from all videos accumulate in one file.
            with open('弹幕.txt', mode='a', encoding='utf-8') as f:
                f.writelines(danmu + '\n' for danmu in datalist)

    warnings.filterwarnings("ignore")
    CountDanmu()
    make_graph()


if __name__ == '__main__':
    main()