You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

148 lines
7.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# 2.1
# 数据获取
# 导入必要的库
import requests # 用于发送HTTP请求
import json # 用于处理JSON数据
import re # 用于正则表达式匹配
import openpyxl # 用于操作Excel文件
import pandas as pd # 用于数据处理和分析
import matplotlib.pyplot as plt # 导入matplotlib模块pyplot函数并使用as给函数起个别名plt
import jieba # 导入jieba分词模块
import wordcloud # 导入词云图模块
import numpy as np # 导入numpy模块
from wordcloud import ImageColorGenerator # 用于从图片生成颜色以渲染词云
from PIL import Image # 从PIL模块中导入Image函数
from collections import Counter # 导入Counter类用于计数
# Target URL of the Bilibili video search. The URL-encoded keyword is
# "2024巴黎奥运会" (2024 Paris Olympics), 50 results are requested per page,
# and a "&page=N" suffix is appended when the request is sent.
url = "https://api.bilibili.com/x/web-interface/wbi/search/type?page_size=50&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&search_type=video"
# Request headers that imitate a desktop browser.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/105"
}
# Cookies sent with the search requests.
# NOTE(review): several of these (SESSDATA, bili_jct, DedeUserID, bili_ticket)
# look like a logged-in account's session credentials committed to source --
# presumably they should be loaded from the environment or a local,
# untracked file and the committed values rotated; confirm with the owner.
cookies = {
"buvid3":"11680F37-E62C-C9EF-B7B2-D198332B61D003377infoc",
"LIVE_BUVID": "AUTO6416610696475006",
"rpdid": "|(u))kkYu|R~0J'uY~|)R)Ylm",
"header_theme_version": "CLOSE",
"FEED_LIVE_VERSION": "V8",
"buvid4": "3385D89E-82EA-B792-EDDB-1C760C74377804485-022082013-T76gCyS8edfpVC%2B%2FEm%2F9gg%3D%3D",
"CURRENT_FNVAL": "4048",
"DedeUserID": "476167968",
"DedeUserID__ckMd5": "99a92c4d7eeebff0",
"b_nut": "100",
"_uuid": "62A11A33-AE63-6B810-5BA8-74B644C9153733847infoc",
"buvid_fp_plain": "undefined",
"enable_web_push": "DISABLE",
"buvid_fp": "283c30b12d458a0311744e346aebd9e9",
"home_feed_column": "5",
"fingerprint": "c828157aaa66d22b3a4b841dc0dd5a96",
"PVID": "2",
"browser_resolution": "2074-1144",
"SESSDATA": "82243f05%2C1742112657%2C2d174%2A92CjA9CO3snstNoEvGickntljsVXZ2EcZHTxBcUE6h9oSQ1xl21MqQVX6gOpdj74xy2VoSVkJfdzRzTy1RUkVyNGhjcDQtejBMMHBHUEUxbWI0bWowU1lWVG9mUmVoQkE4X3lPS2phUjVqS2tHRTVpdlo1U19WQVd4dzQ4RzRpNUtIZjdPX0ZkOS13IIEC",
"bili_jct": "efe32125b9809a8323328e31b7fa7e99",
"hit-dyn-v2": "1",
"bili_ticket": "eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4MjUxMDYsImlhdCI6MTcyNjU2NTg0NiwicGx0IjotMX0.n0BlSThp9ye0k6US20pEh5gz191I6-1IgplXC1yJ2uI",
"bili_ticket_expires": "1726825046",
"b_lsid": "13EA7118_191FFF83B6D",
"bp_t_offset_476167968": "978129864806629376",
"sid": "6bro4vni",
}
# Accumulator for every danmaku (bullet comment) text collected by the script.
danmu_list = []
# Matches one danmaku element of the XML document and captures its text.
_DANMU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')


# Fetch the danmaku (bullet comments) of a single video.
def get_danmu(bvid, headers, timeout=10):
    """Return the danmaku texts of the video identified by *bvid*.

    The video's ``cid`` is looked up through the web-interface "view"
    endpoint and then used to download the danmaku XML document from
    comment.bilibili.com.

    Args:
        bvid: Video id, sent as the ``bvid`` query parameter.
        headers: HTTP headers sent with both requests.
        timeout: Seconds to wait on each request. Without a timeout a
            stalled connection would block the whole scrape forever.

    Returns:
        A list with the text of every ``<d p="...">...</d>`` element found
        in the XML, in document order (empty if there is none).

    Raises:
        requests.RequestException: on a network failure, a timeout, a
            non-2xx HTTP status or an undecodable JSON body.
        KeyError, TypeError: if the view response has no ``data.cid`` entry.
    """
    # Step 1: resolve the video's cid.
    view_resp = requests.get(
        "https://api.bilibili.com/x/web-interface/view",
        params={"bvid": bvid},
        headers=headers,
        timeout=timeout,
    )
    view_resp.raise_for_status()
    cid = view_resp.json()['data']['cid']

    # Step 2: download the danmaku XML that belongs to that cid.
    xml_resp = requests.get(
        f"https://comment.bilibili.com/{cid}.xml",
        headers=headers,
        timeout=timeout,
    )
    xml_resp.raise_for_status()
    # Force UTF-8 so the text is not decoded with a guessed charset.
    xml_resp.encoding = 'utf-8'

    # Step 3: pull the text out of every <d> element.
    return _DANMU_PATTERN.findall(xml_resp.text)
# Walk the first 10 result pages of the search and collect the danmaku of
# every video listed on them.
# A single session is shared by all page requests so that the connection
# (and any cookies the server sets) is really kept between pages; it is
# closed automatically when the block is left.
with requests.Session() as sess:
    for page in range(1, 11):
        # Pages are numbered from 1 in the query string.
        req = sess.get(
            url + "&page=" + str(page),
            headers=headers,
            cookies=cookies,
            timeout=10,  # never hang forever on a stalled connection
        )
        req.raise_for_status()
        res = req.json()
        # A page without a 'result' entry is treated as empty instead of
        # aborting the run and losing everything collected so far.
        # NOTE(review): presumably this happens past the last result page --
        # confirm against the API.
        for video in res['data'].get('result') or []:
            # extend() grows the list in place instead of copying it for
            # every video.
            danmu_list.extend(get_danmu(video['bvid'], headers))
# Dump all collected danmaku into a one-column Excel sheet.
df = pd.DataFrame(danmu_list)
df.to_excel('danmu.xlsx', index=False, engine='openpyxl')
print("弹幕已保存")
# 2.2
# Statistics
# Keywords a danmaku must contain (as a substring) to be counted.
keywords = ['Ai', '智能', '3D', '技术', '科学']
# Matching is case-insensitive, so BOTH sides are lower-cased. Lower-casing
# only the danmaku made 'Ai' and '3D' impossible to match.
lowered_keywords = [keyword.lower() for keyword in keywords]
filtered_danmu_list = [
    danmu
    for danmu in danmu_list
    if any(keyword in danmu.lower() for keyword in lowered_keywords)
]
# Count identical danmaku texts and keep the 8 most frequent ones.
danmu_count = Counter(filtered_danmu_list)
top_8_count = danmu_count.most_common(8)
# Write the ranking to a fresh workbook: a header row followed by one
# (text, count) row per danmaku.
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.append(['弹幕', '数量'])
for danmu, count in top_8_count:
    sheet.append([danmu, count])
# Progress message.
print("弹幕已过滤")
# Save the workbook.
workbook.save('danmu_count.xlsx')
# 2.3
# Visualisation
# Read back the danmaku column of the ranking written to danmu_count.xlsx.
df = pd.read_excel('danmu_count.xlsx', usecols=['弹幕'])
# Merge all danmaku into one string, segment it with jieba and re-join the
# tokens with spaces, which is the input format WordCloud.generate expects.
text = ' '.join(df['弹幕'].astype(str).tolist())
word = ' '.join(jieba.cut(text))
# Extra stop words to drop from the cloud.
# NOTE(review): every entry is an empty string as the file stands --
# presumably the intended words were lost; fill them in.
stopwords = set(["", "", "", "", "", "", "", "", ""])
# Load the mask picture; the context manager closes the file handle once
# the pixels have been copied into the array.
with Image.open('1.png') as mask_image:
    pic = np.array(mask_image)
# Colour function that samples the colours of the mask picture.
image_colors = ImageColorGenerator(pic)
# Configure and build the word cloud.
wd = wordcloud.WordCloud(
    mask=pic,  # the picture defines the shape of the cloud
    font_path='simhei.ttf',  # a font with CJK glyphs so Chinese text renders
    background_color='white',
    # The custom set was never handed to WordCloud before. It is merged with
    # the library defaults so the default filtering stays in effect.
    stopwords=wordcloud.STOPWORDS | stopwords,
)
wd.generate(word)
# Recolour the cloud with the picture's colours and display it without axes.
plt.imshow(wd.recolor(color_func=image_colors), interpolation='bilinear')
plt.axis('off')
plt.show()