"""Crawl Bilibili danmaku (bullet comments) for a keyword and summarise them.

Pipeline, as run by ``main()``:

1. ``get_search`` searches Bilibili for videos matching a keyword and appends
   every danmaku of every part of each video to ``弹幕文本.txt``.
2. ``get_wordcloud`` renders a word cloud image from that text.
3. ``get_excel`` exports how often each distinct danmaku occurs.
4. ``analyze_ai_application`` exports how often AI-related keywords occur.

Provenance: recovered from a git patch whose line breaks had been lost
(commit a8cb733de76d0830785e308b90cd5cc726258773, "ADD file via upload",
psp2mw64e <2304110814@qq.com>, Mon, 16 Sep 2024 11:31:45 +0800, new file
``crawl.py``).

Authentication: the login cookie is read from the ``BILIBILI_COOKIE``
environment variable. Never commit a session cookie to source control.
"""

import json
import os
import re
import time

import jieba
import numpy as np
import pandas as pd
import requests
import wordcloud
from PIL import Image

SEARCH_URL = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
PAGELIST_URL = 'https://api.bilibili.com/x/player/pagelist'
DANMAKU_URL = 'https://api.bilibili.com/x/v1/dm/list.so'

BVID_FILE = 'bv_id_1.txt'
DANMAKU_FILE = '弹幕文本.txt'

PAGE_SIZE = 30          # videos requested per search page
REQUEST_TIMEOUT = 10    # seconds; without it a stalled connection hangs forever
COOKIE_ENV_VAR = 'BILIBILI_COOKIE'

# The danmaku endpoint returns XML in which each comment is <d p="...">text</d>.
DANMAKU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')

# Laughter ("哈哈", "哈哈哈", ...) of any common length carries no topic signal.
LAUGHTER_STOPWORDS = {'哈' * n for n in range(2, 21)}
STOPWORDS = LAUGHTER_STOPWORDS | {
    '的', '了', '啊', '在', '是', '我', '你', '他', '她', '它', '有', '和',
    '这', '那', '这个', '不是', '真的', '我们', '你们', '他们',
}

AI_KEYWORDS = [
    "智能装备", "数据分析", "虚拟训练", "人工智能", "AR", "3D分析", "自动裁判",
    "ai", "科学", "自动", "芯片", "算法", "智能", "建模",
]


def _build_headers():
    """Return the HTTP headers used for every Bilibili request.

    The cookie is taken from the ``BILIBILI_COOKIE`` environment variable and
    is omitted when that variable is unset or blank.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        # Placeholder; change to the URL of the target video if required.
        'Referer': 'https://www.bilibili.com/video/BV1xxxxxxx',
        'Accept': 'application/json, text/plain, */*',
    }
    cookie = os.environ.get(COOKIE_ENV_VAR, '').strip()
    if cookie:
        headers['cookie'] = cookie
    return headers


def _cookie_value(cookie, name):
    """Return the value stored under *name* in a ``k=v; k=v`` cookie string, or ''."""
    for part in cookie.split(';'):
        key, _, value = part.strip().partition('=')
        if key.strip() == name:
            return value.strip()
    return ''


def _fetch(url, headers, params=None):
    """GET *url* and return the response, or None when the request itself fails."""
    try:
        return requests.get(url, headers=headers, params=params,
                            timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"请求异常:{exc}")
        return None


def _parse_json(response):
    """Decode a response body as a JSON object; return {} if it is not one."""
    try:
        payload = json.loads(response.content.decode('utf-8'))
    except ValueError:
        # Covers both invalid UTF-8 and invalid JSON (e.g. an HTML block page).
        return {}
    return payload if isinstance(payload, dict) else {}


def _append_lines(path, lines):
    """Append each item of *lines* to *path*, one per line, as UTF-8."""
    with open(path, mode='a', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in lines)


def _read_danmaku_text():
    """Return the collected danmaku text, or '' if nothing has been crawled yet."""
    try:
        with open(DANMAKU_FILE, encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        return ''


def _crawl_video(bv_id, headers):
    """Save the danmaku of every part of one video.

    Returns True when the video's part list could be retrieved (the video was
    processed), False when the video had to be skipped.
    """
    print(f"正在处理视频: {bv_id}")
    _append_lines(BVID_FILE, [bv_id])

    # A video may consist of several parts; each part has its own cid.
    response = _fetch(PAGELIST_URL, headers, {'bvid': bv_id, 'jsonp': 'jsonp'})
    if response is None:
        return False
    if response.status_code != 200:
        print(f"获取 cid 失败,状态码:{response.status_code}")
        return False

    parts = _parse_json(response).get('data') or []
    if not parts:
        print(f"没有找到视频 {bv_id} 的 cid。")
        return False

    for part in parts:
        cid = part.get('cid') if isinstance(part, dict) else None
        if cid is None:
            continue

        response = _fetch(DANMAKU_URL, headers, {'oid': str(cid)})
        if response is None:
            continue
        if response.status_code != 200:
            print(f"获取弹幕失败,状态码:{response.status_code}")
            continue

        xml_text = response.content.decode('utf-8', errors='replace')
        content_list = DANMAKU_PATTERN.findall(xml_text)
        if not content_list:
            print(f"视频 {bv_id} 没有弹幕。")
            continue

        print(f"视频 {bv_id} 的弹幕数量: {len(content_list)}")
        _append_lines(DANMAKU_FILE, content_list)

    return True


def get_search(v_keyword, max_videos=300):
    """Search Bilibili for *v_keyword* and collect danmaku of the matching videos.

    Args:
        v_keyword: Search keyword.
        max_videos: Maximum number of videos to process.

    Side effects:
        Appends video ids to ``bv_id_1.txt`` and danmaku text to
        ``弹幕文本.txt``; prints progress to stdout.
    """
    if max_videos <= 0:
        return

    headers = _build_headers()
    csrf = _cookie_value(headers.get('cookie', ''), 'bili_jct')
    # Ceiling division so a final, partially needed page is still requested.
    max_page = -(-max_videos // PAGE_SIZE)
    video_count = 0

    for page in range(1, max_page + 1):
        if video_count >= max_videos:
            break

        # NOTE(review): the /wbi/ search endpoint may additionally require
        # signed w_rid/wts parameters -- confirm against the live API.
        params = {
            '__refresh__': 'true',
            'page': page,
            'page_size': PAGE_SIZE,
            'keyword': v_keyword,
            'order': 'totalrank',
            'search_type': 'video',
        }
        if csrf:
            params['csrf'] = csrf

        r = _fetch(SEARCH_URL, headers, params)
        if r is None:
            break
        if r.status_code != 200:
            print(f"请求失败,状态码:{r.status_code}")
            break

        # 'data' can be JSON null on errors, so guard with `or`.
        result = (_parse_json(r).get('data') or {}).get('result') or []
        if not result:
            print(f"第{page}页没有结果,停止爬取。")
            break

        for item in result:
            if video_count >= max_videos:
                break
            bv_id = item.get('bvid') if isinstance(item, dict) else None
            if not bv_id:
                continue

            if _crawl_video(bv_id, headers):
                video_count += 1
                print(f"当前已爬取视频数量: {video_count}")
            time.sleep(1)  # throttle to reduce the risk of being blocked


def get_wordcloud():
    """Render ``词云图.png`` from the collected danmaku text.

    Uses ``7.png`` as the shape mask and ``msyh.ttc`` as the font; both must be
    resolvable from the working directory / system fonts.
    """
    txt = _read_danmaku_text().strip()
    if not txt:
        print("错误:弹幕文本为空,无法生成词云。")
        return

    # WordCloud splits on whitespace, so segment the Chinese text first.
    string = ' '.join(jieba.cut(txt))
    mask = np.array(Image.open("7.png"))
    wc = wordcloud.WordCloud(
        background_color='white',
        font_path='msyh.ttc',
        colormap='Blues',
        mask=mask,
        stopwords=STOPWORDS,
    )
    wc.generate(string)
    wc.to_file('词云图.png')
    print("词云图生成成功!")


def get_excel():
    """Export the occurrence count of each distinct danmaku to ``结果统计.xlsx``."""
    stripped = (line.strip() for line in _read_danmaku_text().split('\n'))
    lines = [line for line in stripped if line]  # blank lines are not danmaku

    line_counts = pd.Series(lines, dtype=object).value_counts()
    df = pd.DataFrame({'弹幕': line_counts.index, '出现次数': line_counts.values})
    df.to_excel('结果统计.xlsx', index=False)


def analyze_ai_application():
    """Export AI-related keyword counts to ``关键词统计.xlsx``.

    Counting is a case-sensitive, non-overlapping substring count over the
    whole text, so a keyword contained in another (e.g. "智能" inside
    "人工智能") is counted for both.
    """
    text = _read_danmaku_text()
    keyword_counts = {kw: text.count(kw) for kw in AI_KEYWORDS}
    ranked = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(ranked, columns=['关键词', '出现次数'])
    df.to_excel('关键词统计.xlsx', index=False)


def main():
    """Run the full crawl-and-analyse pipeline for the 2024 Paris Olympics."""
    get_search('2024巴黎奥运会', max_videos=300)
    get_wordcloud()
    get_excel()
    analyze_ai_application()


if __name__ == '__main__':
    main()