"""Collect Bilibili danmaku for a search keyword, rank them, draw a word cloud.

Reconstructed from a git format-patch (commit 97665b1e, dated 2024-09-18,
subject "Data acquisition, statistics and visualization") that added this
script as ``main.py``.

Pipeline
    2.1  Query the video search API page by page, download the danmaku of
         every hit and save them to ``danmu.xlsx``.
    2.2  Keep the danmaku that mention one of ``keywords`` and write the
         8 most frequent ones to ``danmu_count.xlsx``.
    2.3  Render those danmaku as a word cloud shaped and coloured by
         ``1.png``.

Cookies for the search request are read from the ``BILIBILI_COOKIE``
environment variable (a raw ``Cookie`` header such as ``"a=1; b=2"``) so
that account credentials are not stored in source control.
"""

import html
import json
import os
import re
import sys
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import requests
import wordcloud
from PIL import Image
from wordcloud import ImageColorGenerator

# Search endpoint; the keyword is the percent-encoded query text.
url = (
    "https://api.bilibili.com/x/web-interface/wbi/search/type"
    "?page_size=50"
    "&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A"
    "&search_type=video"
)

# Browser-like request headers.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 "
        "SLBrowser/9.0.3.5211 SLBChan/105"
    )
}

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# Matches one danmaku element of the comment XML and captures its text.
_DANMU_RE = re.compile(r'<d p="[^"]*">(.*?)</d>', re.S)

# Keywords a danmaku must mention (case-insensitively) to be counted.
keywords = ['Ai', '智能', '3D', '技术', '科学']

# Words left out of the word cloud: the library's built-in list plus
# common Chinese function words.
stopwords = set(wordcloud.STOPWORDS) | {
    "的", "了", "和", "是", "在", "我", "你", "他", "她",
}


def _parse_cookie_header(raw):
    """Convert a raw ``Cookie`` header string into a ``{name: value}`` dict.

    Pairs without a name or without ``=`` are ignored; an empty string
    yields an empty dict.
    """
    jar = {}
    for pair in raw.split(";"):
        name, sep, value = pair.strip().partition("=")
        if sep and name:
            jar[name] = value
    return jar


# Cookies sent with the search request; empty when the variable is unset.
cookies = _parse_cookie_header(os.environ.get("BILIBILI_COOKIE", ""))


def get_danmu(bvid, headers, session=None, timeout=REQUEST_TIMEOUT):
    """Return the danmaku texts of the video identified by *bvid*.

    Args:
        bvid: Video id as found in the search results.
        headers: HTTP headers sent with both requests.
        session: Optional ``requests.Session`` to reuse connections;
            plain ``requests.get`` is used when omitted.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of danmaku strings. The list is empty when the video's cid
        cannot be resolved or a request fails, so one bad video does not
        abort the whole crawl.
    """
    http = session if session is not None else requests

    # Step 1: resolve the cid that the comment endpoint is keyed by.
    try:
        view_resp = http.get(
            "https://api.bilibili.com/x/web-interface/view",
            params={"bvid": bvid},
            headers=headers,
            timeout=timeout,
        )
        cid = json.loads(view_resp.text)["data"]["cid"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        print(f"skip {bvid}: cannot resolve cid ({exc!r})", file=sys.stderr)
        return []

    # Step 2: download the comment XML for that cid.
    try:
        xml_resp = http.get(
            f"https://comment.bilibili.com/{cid}.xml",
            headers=headers,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"skip {bvid}: cannot fetch danmaku ({exc!r})", file=sys.stderr)
        return []

    xml_resp.encoding = 'utf-8'
    # The text sits inside XML elements, so entities such as &amp; must be
    # decoded to obtain what viewers actually typed.
    return [html.unescape(text) for text in _DANMU_RE.findall(xml_resp.text)]


def collect_danmu(pages=10):
    """Fetch *pages* pages of search results and gather all their danmaku.

    A page that cannot be fetched or decoded is reported on stderr and
    skipped.
    """
    collected = []
    with requests.Session() as sess:
        for page in range(1, pages + 1):
            try:
                resp = sess.get(
                    f"{url}&page={page}",
                    headers=headers,
                    cookies=cookies,
                    timeout=REQUEST_TIMEOUT,
                )
                body = json.loads(resp.text)
            except (requests.RequestException, ValueError) as exc:
                print(f"skip page {page}: {exc!r}", file=sys.stderr)
                continue

            videos = (body.get("data") or {}).get("result") or []
            if not videos:
                print(
                    f"page {page}: no results "
                    f"(code={body.get('code')}, message={body.get('message')})",
                    file=sys.stderr,
                )
            for video in videos:
                bvid = video.get("bvid")
                if bvid:
                    collected.extend(get_danmu(bvid, headers, session=sess))
    return collected


def filter_by_keywords(danmu_list, keywords):
    """Return the danmaku containing any of *keywords*, ignoring case."""
    # Lowercase both sides; comparing mixed-case keywords against lowered
    # text would make keywords such as 'Ai' or '3D' impossible to match.
    needles = [keyword.lower() for keyword in keywords]
    return [
        danmu for danmu in danmu_list
        if any(needle in danmu.lower() for needle in needles)
    ]


def save_counts(rows, path='danmu_count.xlsx'):
    """Write ``(danmaku, count)`` rows under a header row to an Excel file."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(['弹幕', '数量'])
    for danmu, count in rows:
        sheet.append([danmu, count])
    workbook.save(path)


def draw_wordcloud(count_path='danmu_count.xlsx', mask_path='1.png',
                   font_path='simhei.ttf'):
    """Show a word cloud of the danmaku stored in *count_path*.

    The mask image gives the cloud both its shape and its colours. Nothing
    is drawn when there is no text to plot.
    """
    df = pd.read_excel(count_path, usecols=['弹幕'])
    text = ' '.join(df['弹幕'].astype(str).tolist())
    word = ' '.join(jieba.cut(text))  # space-separated tokens
    if not word.strip():
        print("no danmaku to plot; word cloud skipped", file=sys.stderr)
        return

    with Image.open(mask_path) as img:
        pic = np.array(img)
    image_colors = ImageColorGenerator(pic)

    wd = wordcloud.WordCloud(
        mask=pic,
        font_path=font_path,  # a font with CJK glyphs
        background_color='white',
        stopwords=stopwords,
    )
    try:
        wd.generate(word)
    except ValueError as exc:
        # Raised by the library when no word is left to place.
        print(f"word cloud skipped: {exc}", file=sys.stderr)
        return

    plt.imshow(wd.recolor(color_func=image_colors), interpolation='bilinear')
    plt.axis('off')
    plt.show()


def main():
    """Run the three stages: fetch, count, visualise."""
    if not cookies:
        print(
            "BILIBILI_COOKIE is not set; the search API may refuse the request",
            file=sys.stderr,
        )

    # 2.1 data acquisition
    danmu_list = collect_danmu()
    pd.DataFrame(danmu_list).to_excel('danmu.xlsx', index=False, engine='openpyxl')
    print("弹幕已保存")

    # 2.2 statistics
    filtered_danmu_list = filter_by_keywords(danmu_list, keywords)
    top_8_count = Counter(filtered_danmu_list).most_common(8)
    save_counts(top_8_count)
    print("弹幕已过滤")

    # 2.3 visualisation
    draw_wordcloud()


if __name__ == "__main__":
    main()