"""Scrape Bilibili search results, harvest each video's danmu (bullet
comments), and analyse how often AI-related keywords appear.

Pipeline: ``search_bilibili`` -> ``get_video_cid`` -> ``fetch_danmu`` ->
``count_and_rank_danmu`` -> ``export_to_excel`` -> ``generate_wordcloud``.

Reconstructed from patch e02c904 ("ADD search file", ptlqmxye5,
2024-09-17), which created ``bilibili_search.py``.

Third-party dependencies: requests, beautifulsoup4 (with an XML parser),
pandas, wordcloud, jieba.
"""
import requests
import json
from bs4 import BeautifulSoup
from collections import Counter
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import time
import jieba

# Bilibili search API URL (wbi-signed web search endpoint).
search_url = 'https://api.bilibili.com/x/web-interface/wbi/search/type'

# Bilibili video detail API URL, used to resolve a bvid to its cid.
video_info_url = 'https://api.bilibili.com/x/web-interface/view'

# Bilibili danmu (bullet comment) API URL; returns XML keyed by cid.
danmu_url = 'https://api.bilibili.com/x/v1/dm/list.so'


def search_bilibili(query, total_results):
    """Search Bilibili for *query* and return up to *total_results* bvids.

    Pages through the search API (42 results per page) and collects the
    ``bvid`` of each video hit. Pages that fail (HTTP 412 after retries,
    bad JSON, non-zero API code) are skipped, so fewer than
    *total_results* ids may be returned.

    :param query: search keyword string.
    :param total_results: maximum number of video ids to collect.
    :return: list of bvid strings (length <= total_results).
    """
    num_per_page = 42  # maximum videos per search page
    # Ceiling division: how many pages are needed to reach total_results.
    pages_needed = (total_results + num_per_page - 1) // num_per_page
    video_list = []

    for page in range(1, pages_needed + 1):
        # NOTE(review): w_rid/wts are a captured wbi signature and qv_id a
        # captured session value — they are request-specific and will
        # likely expire; presumably they should be recomputed per request.
        params = {
            '__refresh__': 'true',
            '_extra': '',
            'context': '',
            'page_size': num_per_page,
            'from_source': '',
            'from_spmid': '333.337',
            'platform': 'pc',
            'highlight': '1',
            'single_column': '0',
            'keyword': query,
            'qv_id': '0EnOHi82F62j2usODhMghThN7EvXEZmj',
            'source_tag': '3',
            'dynamic_offset': 30,
            'search_type': 'video',
            'w_rid': '16f27d62ff40f1a5f935a6af26432c81',
            'wts': '1726306000',
            'page': page  # current page number
        }
        # SECURITY NOTE: this cookie embeds live credentials (SESSDATA,
        # bili_jct, DedeUserID). They should be loaded from the
        # environment or a config file, never committed to source control.
        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-encoding': 'gzip, deflate, br, zstd',
            'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6',
            'cookie': 'DedeUserID=1075156086; DedeUserID__ckMd5=7460241d769e1da4; buvid4=9980B4C0-302E-C6A9-122A-0EFE06E4B5F435899-022102715-X83v1qigvaWQdhtSeo%2BvYQ%3D%3D; enable_web_push=DISABLE; buvid3=0DD4B4A8-5B28-59F0-F5EB-9EB31F483AF226299infoc; b_nut=1699086426; _uuid=1FCED779-E59E-F3CA-81A8-817C10CCF3105C25422infoc; header_theme_version=CLOSE; PVID=1; buvid_fp=395bc05f8612d5e47df093ecc1b2bd8e; rpdid=|(J|)Y)JlmJJ0J\'u~|~m|lJ|Y; CURRENT_FNVAL=4048; CURRENT_QUALITY=80; FEED_LIVE_VERSION=V_HEADER_LIVE_NO_POP; home_feed_column=5; browser_resolution=1528-738; bsource=search_bing; bp_t_offset_1075156086=976968501354823680; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3MTY5MzMsImlhdCI6MTcyNjQ1NzY3MywicGx0IjotMX0.7WQjSxEb__Z8q6mXZZVKcYfGj_p_EP-8VkK9httVQQA; bili_ticket_expires=1726716873; b_lsid=A255A8C5_191FF65B3BE; SESSDATA=0e66c2c1%2C1742120673%2Cd251f%2A92CjClS9jPOjTyWfjKmoc1Qved4Vfi9N1Jb4KXprWc3-K-qETxsCKQP47sEElvDz-dK0kSVjNHZTNRUUhDSS1DUUJfVzQ3VlQ2NW44YktqbmpLN2hSR2VGQUVIajlfMFAxeERvWlhlWEQ5M1FkX2gxV19FT2wwYjNIcWMwVVRTcElteFpLbkZvRnBRIIEC; bili_jct=0409648e28f719911ffba1058edc4d6d; sid=gq4mtedj',
            'origin': 'https://search.bilibili.com',
            'referer': 'https://search.bilibili.com/all',
            'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
        }

        # BUGFIX: the original printed a "retry" message on HTTP 412 but
        # then `continue`d to the NEXT page, silently dropping this one.
        # Retry the same page a few times before giving up on it.
        response = None
        for _attempt in range(3):
            response = requests.get(search_url, params=params, headers=headers)
            if response.status_code != 412:
                break
            print("请求被阻止,等待1秒重试...")
            time.sleep(1)
        print(f"Page {page} HTTP Status Code: {response.status_code}")
        if response.status_code == 412:
            # Still rate-limited after all retries; skip this page.
            continue

        try:
            data = response.json()
            print(f"Page {page} Parsed JSON Data:")
            print(data)
        except json.JSONDecodeError:
            print(f"Page {page} 无法解析 JSON 数据")
            continue

        if data.get('code') != 0:
            print(f"Page {page} Failed to fetch data from Bilibili API")
            continue

        # Guard against a missing/empty payload instead of raising KeyError.
        videos = (data.get('data') or {}).get('result') or []
        for video in videos:
            video_list.append(video['bvid'])
            if len(video_list) >= total_results:
                break

        if len(video_list) >= total_results:
            break

    return video_list


def get_video_cid(bvid):
    """Resolve a video's *bvid* to its cid via the video detail API.

    :param bvid: Bilibili video id (e.g. ``"BV1xx411c7mD"``).
    :return: the integer cid, or ``None`` on any failure (HTTP error,
        bad JSON, or cid absent from the response).
    """
    params = {
        'bvid': bvid
    }
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
    }

    response = requests.get(video_info_url, params=params, headers=headers)
    print(f"Video Info HTTP Status Code: {response.status_code}")

    if response.status_code != 200:
        print(f"Failed to fetch video info for {bvid}")
        return None

    try:
        data = response.json()
        # Guard the 'data' key too, not just 'cid' (original could KeyError).
        info = data.get('data') or {}
        if 'cid' in info:
            return info['cid']
        print(f"CID not found for video {bvid}")
        return None
    except json.JSONDecodeError:
        print("无法解析视频信息的 JSON 数据")
        return None


def fetch_danmu(cid):
    """Download and parse the danmu XML for one video part (*cid*).

    The list.so endpoint returns XML in which each ``<d>`` element's text
    is one danmu comment.

    :param cid: Bilibili comment id of the video part.
    :return: list of danmu text strings (empty on HTTP failure).
    """
    params = {
        'oid': cid
    }
    headers = {
        'accept': 'application/xml, text/xml, */*',
        'accept-encoding': 'gzip',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
    }

    response = requests.get(danmu_url, params=params, headers=headers)
    print(f"Danmu HTTP Status Code: {response.status_code}")

    if response.status_code != 200:
        print(f"Failed to fetch danmu for CID {cid}")
        return []

    content = response.content.decode('utf-8')
    print("Danmu Response Content:")
    print(content)

    soup = BeautifulSoup(content, 'xml')
    return [d.text for d in soup.find_all('d')]


def count_and_rank_danmu(danmu_texts):
    """Count AI-related danmu, print the top entries, and export them.

    A danmu counts as AI-related if it contains any keyword from the
    hard-coded list (substring match). Keyword hits are tallied both per
    danmu text and per keyword; results are printed and then handed to
    :func:`export_to_excel` (which also triggers the word cloud).

    :param danmu_texts: flat list of danmu strings from all videos.
    """
    ai_keywords = ['人工智能', '机器学习', '深度学习', '自然语言处理', '计算机视觉', '智能算法', '大数据', 'AI', '智能制造', '智能家居', '智能医疗',
                   '物联网', '云计算', '智能服务', '自动化', 'ai', '机器人']
    top_n = 8

    # Frequency of every distinct danmu text.
    counter = Counter(danmu_texts)

    # ai_counter: frequency per AI-related danmu text.
    # keyword_counter: total occurrences per keyword (a text containing
    # several keywords contributes to each of them).
    ai_counter = Counter()
    keyword_counter = Counter()

    for text, count in counter.items():
        for keyword in ai_keywords:
            if keyword in text:
                ai_counter[text] += count
                keyword_counter[keyword] += count

    # Top-N AI-related danmu by frequency.
    ranked_ai_danmu = ai_counter.most_common(top_n)

    print("AI 技术应用关键词的出现次数:")
    for keyword, count in keyword_counter.items():
        print(f"{keyword}: {count} 次")

    print(f"\n排名前 {top_n} 的 AI 技术应用弹幕:")
    for text, count in ranked_ai_danmu:
        print(f"弹幕: {text} - 频率: {count} 次")

    # Persist statistics to Excel (also generates the word cloud).
    export_to_excel(ranked_ai_danmu, keyword_counter)


def export_to_excel(ranked_ai_danmu, keyword_counter):
    """Write danmu statistics to ``danmu_statistics.xlsx`` and build the cloud.

    :param ranked_ai_danmu: list of ``(text, count)`` pairs, already ranked.
    :param keyword_counter: Counter mapping AI keyword -> total occurrences.

    Side effects: writes ``danmu_statistics.xlsx`` (two sheets) and calls
    :func:`generate_wordcloud`, which writes ``词云.png``.
    """
    # Keep raw danmu text (no segmentation) in the spreadsheet.
    df_danmu = pd.DataFrame(ranked_ai_danmu, columns=['弹幕', '频率'])
    df_keywords = pd.DataFrame(keyword_counter.items(), columns=['关键词', '出现次数'])

    with pd.ExcelWriter('danmu_statistics.xlsx') as writer:
        df_danmu.to_excel(writer, sheet_name='AI 技术应用弹幕', index=False)
        df_keywords.to_excel(writer, sheet_name='AI 技术关键词', index=False)

    print("统计结果已导出到 danmu_statistics.xlsx")

    # Segmentation happens only for the word cloud, not the spreadsheet.
    generate_wordcloud(df_danmu)


def generate_wordcloud(df_danmu):
    """Render a word cloud of the ranked danmu and save it as ``词云.png``.

    Each danmu is segmented with jieba so the cloud weights individual
    Chinese words rather than whole comments. Requires the ``simhei.ttf``
    font to be available in the working directory (or on the font path).

    :param df_danmu: DataFrame with a ``'弹幕'`` column of danmu strings.
    """
    # Segment each danmu and re-join with spaces for WordCloud's tokenizer.
    processed_texts = []
    for text in df_danmu['弹幕']:
        words = jieba.cut(text)
        processed_texts.append(' '.join(words))

    text = ' '.join(processed_texts)

    # BUGFIX: the original called wordcloud.generate(text) a second time
    # after this, regenerating the identical cloud for nothing.
    wordcloud = WordCloud(font_path='simhei.ttf', width=800, height=600,
                          background_color='white').generate(text)
    wordcloud.to_file('词云.png')


def main():
    """Run the full pipeline for a fixed query (2024 Paris Olympics)."""
    query = '2024巴黎奥运会'
    total_results = 300  # total number of videos to crawl
    video_list = search_bilibili(query, total_results)

    all_danmu_texts = []
    for bvid in video_list:
        cid = get_video_cid(bvid)
        if cid:
            all_danmu_texts.extend(fetch_danmu(cid))

    count_and_rank_danmu(all_danmu_texts)


if __name__ == '__main__':
    main()