"""Crawl Bilibili danmaku for a search keyword, count AI-related ones, draw a word cloud.

Provenance: recovered from git patch e67f4e7e3819ad896d974b39a743d05bced6eedd
("ADD file via upload"), which creates ``spider_cprofile.py``.

Pipeline (see ``main``):

1. Query the Bilibili search API page by page and collect video BV ids.
2. For every video, resolve its danmaku XML URL and append each danmaku
   text to ``弹幕.txt`` (one per line). This crawl phase runs under cProfile.
3. ``count_danmu`` counts danmaku mentioning "AI"/"人工智能" and exports the
   counts to ``AI技术应用弹幕统计.xlsx``.
4. ``make_graph`` renders a word cloud from ``AI_danmu.txt``.

Authentication: the session cookie is read from the ``BILIBILI_COOKIE``
environment variable. It must never be committed to source control.
"""
import cProfile
import json
import os
import pstats
import re
import warnings
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from PIL import Image
from wordcloud import ImageColorGenerator, WordCloud

# Running number of the video being processed; printed as a progress indicator.
cnt = 1

# Request headers shared by every HTTP call in this module.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}
# SECURITY: the original file embedded a full login cookie (SESSDATA, bili_jct,
# ...). Credentials are now supplied through the environment instead.
_cookie = os.environ.get('BILIBILI_COOKIE')
if _cookie:
    headers['cookie'] = _cookie

DANMU_FILE = '弹幕.txt'
AI_DANMU_FILE = 'AI_danmu.txt'
REQUEST_TIMEOUT = 10      # seconds; without a timeout requests may block forever
PAGE_COUNT = 15           # search result pages to crawl (pages 1..15)
VIDEOS_PER_PAGE = 20      # at most this many videos are taken from each page

_CID_PATTERN = re.compile(r'"cid":(.*?),')
# Each danmaku in the XML feed is expected to be a <d p="...">text</d> element.
_DANMU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')

# Errors that make a single page/video unusable but should not stop the crawl.
# TypeError covers responses whose "data" field is null.
_CRAWL_ERRORS = (requests.RequestException, ValueError, KeyError, IndexError, TypeError)


def get_danmu_url(video_str):
    """Return the danmaku XML URL for the video page at ``video_str``.

    The page HTML is downloaded and the first ``"cid":<value>,`` occurrence
    is used to build ``https://comment.bilibili.com/<cid>.xml``.

    Raises:
        ValueError: if the page contains no ``"cid"`` field.
        requests.RequestException: if the HTTP request fails or times out.
    """
    response = requests.get(url=video_str, headers=headers, timeout=REQUEST_TIMEOUT)
    match = _CID_PATTERN.search(response.text)
    if match is None:
        raise ValueError(f'no "cid" found in page {video_str}')
    return f'https://comment.bilibili.com/{match.group(1)}.xml'


def get_vedio(bv):
    """Return the video page URL for the BV id ``bv``."""
    return "https://www.bilibili.com/video/" + bv


def _video_entries(payload):
    """Return the list of video hits from a decoded search response."""
    groups = payload["data"]["result"]
    for group in groups:
        if isinstance(group, dict) and group.get("result_type") == "video":
            return group["data"]
    # Fall back to the position the original script hard-coded.
    return groups[11]["data"]


def _fetch_bvids(url):
    """Download one search result page and return the BV ids it lists."""
    res = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text
    return [entry["bvid"] for entry in _video_entries(json.loads(res))]


def get_bvid(url, pos):
    """Return the BV id of the ``pos``-th video on the search page ``url``.

    ``url`` is a search API URL of the form
    ``https://api.bilibili.com/x/web-interface/search/all/v2?page=<n>&keyword=<kw>``.
    Every call downloads the page again; to read several ids from one page,
    fetch it once (as ``main`` does) instead of calling this in a loop.

    Raises:
        IndexError: if the page has no video at position ``pos``.
    """
    return _fetch_bvids(url)[pos]


def _fetch_danmu(danmu_url):
    """Download a danmaku XML document and return the danmaku texts in it."""
    response = requests.get(url=danmu_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = response.apparent_encoding
    return _DANMU_PATTERN.findall(response.text)


def count_danmu():
    """Count AI-related danmaku in ``弹幕.txt`` and export them to Excel.

    A danmaku is AI-related when it contains ``AI`` or ``人工智能``. The eight
    most frequent ones are printed, and every AI-related danmaku with its
    count is written to ``AI技术应用弹幕统计.xlsx``.
    """
    with open(DANMU_FILE, 'r', encoding='utf-8') as file:
        danmu_counter = Counter(line.strip() for line in file)

    # Keep only danmaku that mention AI.
    ai_danmu_counter = Counter(
        {k: v for k, v in danmu_counter.items() if 'AI' in k or '人工智能' in k}
    )

    # Print the eight most frequent AI-related danmaku.
    for idx, (danmu, count) in enumerate(ai_danmu_counter.most_common(8), 1):
        print(f'排名 #{idx}: 弹幕 "{danmu}" 出现次数:{count}')

    df = pd.DataFrame(list(ai_danmu_counter.items()), columns=['弹幕', '次数'])
    df.to_excel('AI技术应用弹幕统计.xlsx', index=False)


def make_graph():
    """Render and display a word cloud built from ``AI_danmu.txt``.

    The text is segmented with jieba, shaped by the mask image ``img.png``
    and recoloured with that image's colours.

    NOTE(review): nothing in this module writes ``AI_danmu.txt``; it has to be
    provided separately - confirm where it is supposed to come from.
    """
    with open(AI_DANMU_FILE, 'r', encoding='utf-8') as file:
        text_data = ''.join(line.strip() + ' ' for line in file)

    # Segment the Chinese text; WordCloud expects space-separated tokens.
    word_list = " ".join(jieba.cut(text_data, cut_all=False))

    # Load the custom shape image used both as mask and as colour source.
    with Image.open('img.png') as img:
        shape_mask = np.array(img)

    wordcloud = WordCloud(
        width=2000,
        background_color='white',
        mask=shape_mask,
        contour_width=1,
        contour_color='white',
        font_path='STKAITI.TTF',  # font able to render Chinese glyphs
        max_words=30000,
        colormap='Blues',
    ).generate(word_list)

    # Recolour the words with the colours of the shape image.
    image_colors = ImageColorGenerator(shape_mask)
    wordcloud.recolor(color_func=image_colors)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('')
    plt.show()


def _crawl():
    """Append the danmaku of every video found by the search to ``弹幕.txt``.

    Pages or videos that fail (network error, unexpected response shape,
    missing cid) are reported and skipped so one bad item does not abort
    the whole crawl.
    """
    global cnt
    with open(DANMU_FILE, mode='a', encoding='utf-8') as out:
        for page in range(1, PAGE_COUNT + 1):
            url = f'https://api.bilibili.com/x/web-interface/search/all/v2?page={page}&keyword=2024巴黎奥运会'
            try:
                # One request per page instead of one per video.
                bvids = _fetch_bvids(url)
            except _CRAWL_ERRORS as err:
                print(f'skipping page {page}: {err!r}')
                continue
            for bv in bvids[:VIDEOS_PER_PAGE]:
                print(cnt)
                cnt += 1
                try:
                    danmu = _fetch_danmu(get_danmu_url(get_vedio(bv)))
                except _CRAWL_ERRORS as err:
                    print(f'skipping video {bv}: {err!r}')
                    continue
                out.writelines(f'{text}\n' for text in danmu)


def main():
    """Crawl danmaku under cProfile, then run the statistics and the word cloud."""
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        _crawl()
    finally:
        # Report the profile even when the crawl is interrupted by an error.
        profiler.disable()
        stats = pstats.Stats(profiler).sort_stats('cumulative')
        stats.print_stats()

    warnings.filterwarnings("ignore")
    count_danmu()
    make_graph()


if __name__ == '__main__':
    main()