import requests
import re
import warnings
import json
import jieba
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
from collections import Counter

# Request headers (cookie and user-agent copied from a logged-in browser session)
headers = {
    'cookie': 'b_nut=1659613422; buvid3=6C07DC9F-EE29-7F28-2B63-1BF4ECD504A422941infoc; '
              'CURRENT_FNVAL=4048; header_theme_version=CLOSE; '
              'buvid4=92532619-00E5-BF92-443B-595CD15DE59481123-023013113-97xIUW%2FWJtRnoJI8Rbvu4Q%3D%3D;'
              ' enable_web_push=DISABLE; rpdid=|(u))kkYu|J|0J\'u~u|)u)RR); '
              'hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; '
              'LIVE_BUVID=AUTO2617189721183630; PVID=1; buvid_fp_plain=undefined; '
              'CURRENT_QUALITY=80; _uuid=8108A2C6D-A7AD-7F210-B10E5-EA35A5B47DA391233infoc; '
              'home_feed_column=5; browser_resolution=1545-857; '
              'bsource=search_bing; fingerprint=0c7279b7c69b9542a76b8d9df9b7872a; '
              'buvid_fp=0c7279b7c69b9542a76b8d9df9b7872a; '
              'bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU0NTE2MTEsImlhdCI6MTcyNTE5MjM1MSwicGx0IjotMX0.9HAkh-aLUFL3i2asyrGNSGwvZnlCdO1qHnr8KCPYRAY; '
              'bili_ticket_expires=1725451551; b_lsid=B7B10E6101_191B8F11FA5; bp_t_offset_1760559884=973015460700225536;'
              ' SESSDATA=96c7142d%2C1740938493%2C3a910%2A92CjCc4yaZOS0NpMlzpaXXFlyvjHEGHEZxVtH8JQp1M7im9KrgmNTYIP2F2prPQh4WI4gSVjJtTUt1dGVjMk9SMk9HNkl5MXRWV0tISnNlYzJndGhFVFR1SHVVLWt4UTJjLS1VQ0h1THFmcUY2UU5BV1Jsa2VjTGxDYnpFcnppLVNBQkp3VXdjYzVnIIEC; '
              'bili_jct=3a65db4d1ef7bc981b1673000e0bc73c; DedeUserID=1760559884;'
              ' DedeUserID__ckMd5=b5c900381ecb7bcd; sid=ojanxj62',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}

cnt = 1


# Get the danmaku (bullet-comment) XML URL for a video page
def GetDanMuUrl(video_str):
    url = video_str
    response = requests.get(url=url, headers=headers)
    html = response.text
    # The video page embeds its cid, which identifies the danmaku feed
    cid = re.search('"cid":(.*?),', html).groups()[0]
    danmu_url = f'https://comment.bilibili.com/{cid}.xml'
    return danmu_url


# Get one BV id from the search API
def GetBvid(url, pos):
    # The search API "https://api.bilibili.com/x/web-interface/search/all/v2?page=1-15&keyword="
    # is used to collect the BV ids of the first 300 videos (15 pages x 20 results)
    res = requests.get(url=url, headers=headers).text
    json_dict = json.loads(res)
    return json_dict["data"]["result"][11]["data"][pos]["bvid"]


# Build the video page URL from a BV id
def GetVedio(bv):
    vedio_url = "https://www.bilibili.com/video/" + bv
    return vedio_url


# Count how often each danmaku appears
def CountDanmu():
    # Read the collected danmaku, one per line
    file_path = '弹幕.txt'
    danmu_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Strip the newline and collect every danmaku
            danmu_list.append(line.strip())

    # Count occurrences of each danmaku with Counter
    danmu_counter = Counter(danmu_list)

    # First keep only the danmaku related to AI technology
    ai_danmu_counter = {k: v for k, v in danmu_counter.items() if 'AI' in k or '人工智能' in k}
    # Then turn the filtered dict back into a Counter
    ai_danmu_counter = Counter(ai_danmu_counter)
    # Finally take the 8 most frequent AI-related danmaku
    top_8_ai_danmus = ai_danmu_counter.most_common(8)

    # Print the top-8 AI-related danmaku and their counts
    for idx, (danmu, count) in enumerate(top_8_ai_danmus, 1):
        print(f'排名 #{idx}: 弹幕 "{danmu}" 出现次数:{count}')

    # Write the AI-related statistics to an Excel sheet
    df = pd.DataFrame(list(ai_danmu_counter.items()), columns=['弹幕', '次数'])
    df.to_excel('AI技术应用弹幕统计.xlsx', index=False)

    # Also write the filtered danmaku to 'AI_danmu.txt' so make_graph() has input to read;
    # repeating each line by its count keeps the word-cloud weights proportional to frequency.
    # (The original script reads this file but the code that created it was not shown.)
    with open('AI_danmu.txt', 'w', encoding='utf-8') as f:
        for danmu, count in ai_danmu_counter.items():
            f.write((danmu + '\n') * count)


# Generate the word cloud
def make_graph():
    text_data = ''
    with open('AI_danmu.txt', 'r', encoding='utf-8') as file:
        for line in file:
            text_data += line.strip() + ' '

    # Segment the Chinese text with jieba
    words = jieba.cut(text_data, cut_all=False)
    word_list = " ".join(words)  # join the segmented words into one string

    # Load the shape image used as the word-cloud mask
    # (file name is an assumption; the original script did not show where shape_mask was loaded)
    shape_mask = np.array(Image.open('shape.png'))

    # Build the word cloud with the custom shape
    wordcloud = WordCloud(width=2000,
                          background_color='white',
                          mask=shape_mask,          # custom shape mask
                          contour_width=1,
                          contour_color='white',    # outline colour
                          font_path='STKAITI.TTF',  # font file that can render Chinese
                          max_words=30000,          # maximum number of words shown
                          colormap='Blues',         # colour map, change as needed
                          ).generate(word_list)

    # Recolour the cloud with the colours of the shape image
    image_colors = ImageColorGenerator(shape_mask)
    wordcloud.recolor(color_func=image_colors)

    # Display the result (the original function ends after recolor; this display step is assumed)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


def main():
    # warnings.filterwarnings("ignore")
    global cnt
    for i in range(1, 16):  # search-result pages 1-15
        url = f'https://api.bilibili.com/x/web-interface/search/all/v2?page={i}&keyword=2024巴黎奥运会'
        for j in range(20):  # 20 videos per page
            print(cnt)
            cnt += 1
            bv = GetBvid(url, j)
            vedio_url = GetVedio(bv)
            danmu_url = GetDanMuUrl(vedio_url)
            # print(danmu_url)
            response = requests.get(url=danmu_url, headers=headers)
            response.encoding = response.apparent_encoding
            # Each danmaku in the XML feed sits inside a <d p="..."> ... </d> element
            pattern = '<d p=".*?">(.*?)</d>'
            datalist = re.findall(pattern, response.text)
            # print(datalist)
            with open('弹幕.txt', mode='a', encoding='utf-8') as f:
                for k in range(len(datalist)):
                    f.write(datalist[k] + '\n')
    warnings.filterwarnings("ignore")
    CountDanmu()
    make_graph()


if __name__ == '__main__':
    main()
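# --------------------------------------------------------------------------
# Optional sketch (not called anywhere above): parsing the danmaku XML with a
# real XML parser instead of a regular expression. This is not part of the
# original script; it is a minimal alternative that assumes the same danmu_url
# returned by GetDanMuUrl() and the standard <d p="...">text</d> layout of the
# comment.bilibili.com feed. It could replace the re.findall step in main().
def parse_danmu_xml(danmu_url):
    import xml.etree.ElementTree as ET
    response = requests.get(url=danmu_url, headers=headers)
    response.encoding = response.apparent_encoding
    root = ET.fromstring(response.text)
    # Every <d> element holds one danmaku; its text content is the comment itself
    return [d.text for d in root.iter('d') if d.text]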