"""Collect Bilibili danmu (bullet comments) and rank the AI-related ones.

Pipeline, as implemented below:

1. ``search_bilibili`` queries the Bilibili search API and collects BV ids.
2. ``get_video_cid`` resolves each BV id to a ``cid``.
3. ``fetch_danmu`` downloads the danmu XML for that ``cid``.
4. ``count_and_rank_danmu`` counts danmu containing AI-related keywords,
   prints the ranking, exports it to Excel and renders a word cloud.

Authentication: the session cookie is read from the ``BILIBILI_COOKIE``
environment variable. It must never be committed to source control; the
session that was previously hard-coded here should be considered leaked
and be revoked.
"""

import json
import os
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Bilibili search API URL.
search_url = 'https://api.bilibili.com/x/web-interface/wbi/search/type'

# Bilibili video detail API URL, used to look up a video's cid.
video_info_url = 'https://api.bilibili.com/x/web-interface/view'

# Bilibili danmu API URL.
danmu_url = 'https://api.bilibili.com/x/v1/dm/list.so'

# Seconds to wait for any single HTTP request before giving up, so that one
# stalled connection cannot hang the whole crawl.
_REQUEST_TIMEOUT = 10

# How many times a search page is requested when the API answers HTTP 412.
_MAX_ATTEMPTS = 3

# Name of the environment variable holding the logged-in session cookie.
_COOKIE_ENV_VAR = 'BILIBILI_COOKIE'

_ACCEPT_LANGUAGE = 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,en-GB;q=0.6'
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
)


def _http_get(url, params, headers):
    """Issue a GET request with a timeout; return the response or ``None``.

    Network-level failures (DNS, connection reset, timeout, ...) are reported
    on stdout and turned into ``None`` so callers can skip the item instead
    of aborting the whole run.
    """
    try:
        return requests.get(
            url, params=params, headers=headers, timeout=_REQUEST_TIMEOUT
        )
    except requests.RequestException as exc:
        print(f"Request to {url} failed: {exc}")
        return None


def _search_headers():
    """Build the browser-like headers sent with every search request."""
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': _ACCEPT_LANGUAGE,
        'origin': 'https://search.bilibili.com',
        'referer': 'https://search.bilibili.com/all',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': _USER_AGENT,
    }
    # The cookie is a credential: take it from the environment rather than
    # from source, and omit the header entirely when it is not configured.
    cookie = os.environ.get(_COOKIE_ENV_VAR)
    if cookie:
        headers['cookie'] = cookie
    return headers


def _fetch_search_page(query, page, page_size, headers):
    """Fetch one page of search results; return the decoded JSON or ``None``.

    An HTTP 412 response is retried for the *same* page (up to
    ``_MAX_ATTEMPTS`` times, one second apart) instead of silently moving on
    to the next page.
    """
    params = {
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page_size': page_size,
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': query,
        'qv_id': '0EnOHi82F62j2usODhMghThN7EvXEZmj',
        'source_tag': '3',
        'dynamic_offset': 30,
        'search_type': 'video',
        # NOTE(review): w_rid / wts look like a signature and timestamp
        # captured from one browser request; presumably they go stale and do
        # not match other keywords/pages -- confirm against the WBI signing
        # scheme and compute them per request if so.
        'w_rid': '16f27d62ff40f1a5f935a6af26432c81',
        'wts': '1726306000',
        'page': page,
    }

    for _attempt in range(_MAX_ATTEMPTS):
        response = _http_get(search_url, params, headers)
        if response is None:
            return None
        print(f"Page {page} HTTP Status Code: {response.status_code}")
        if response.status_code != 412:
            break
        print("请求被阻止,等待1秒重试...")
        time.sleep(1)
    else:
        # Every attempt was answered with 412; give up on this page.
        return None

    try:
        data = response.json()
    except ValueError:
        # ValueError covers json.JSONDecodeError as well as the decode error
        # requests raises when simplejson is installed.
        print(f"Page {page} 无法解析 JSON 数据")
        return None

    print(f"Page {page} Parsed JSON Data:")
    print(data)

    if not isinstance(data, dict) or data.get('code') != 0:
        print(f"Page {page} Failed to fetch data from Bilibili API")
        return None
    return data


def search_bilibili(query, total_results):
    """Search Bilibili videos and return up to ``total_results`` BV ids.

    Args:
        query: Search keyword.
        total_results: Maximum number of video ids to collect.

    Returns:
        A list of ``bvid`` strings in the order the API returned them. The
        list may be shorter than ``total_results`` when pages fail or the
        search runs out of results.
    """
    num_per_page = 42  # Page size requested from the API.
    # Ceiling division: number of pages needed to reach total_results.
    pages_needed = (total_results + num_per_page - 1) // num_per_page
    video_list = []
    headers = _search_headers()

    for page in range(1, pages_needed + 1):
        data = _fetch_search_page(query, page, num_per_page, headers)
        if data is None:
            continue

        # Error payloads and exhausted searches may lack 'data' or 'result'.
        videos = (data.get('data') or {}).get('result') or []
        if not videos:
            print(f"Page {page} returned no videos, stopping")
            break

        for video in videos:
            video_id = video.get('bvid')
            if not video_id:
                continue
            video_list.append(video_id)
            if len(video_list) >= total_results:
                return video_list

    return video_list


def get_video_cid(bvid):
    """Return the ``cid`` of the video identified by ``bvid``, or ``None``.

    ``None`` is returned when the request fails, the status is not 200, the
    body is not valid JSON, or the payload carries no ``cid``.
    """
    params = {'bvid': bvid}
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip',
        'accept-language': _ACCEPT_LANGUAGE,
        'user-agent': _USER_AGENT,
    }

    response = _http_get(video_info_url, params, headers)
    if response is None:
        return None
    print(f"Video Info HTTP Status Code: {response.status_code}")

    if response.status_code != 200:
        print(f"Failed to fetch video info for {bvid}")
        return None

    try:
        data = response.json()
    except ValueError:
        print("无法解析视频信息的 JSON 数据")
        return None

    # On API errors 'data' can be missing or null; treat that as "no cid"
    # instead of raising KeyError/TypeError.
    details = data.get('data') if isinstance(data, dict) else None
    if not isinstance(details, dict) or 'cid' not in details:
        print(f"CID not found for video {bvid}")
        return None
    return details['cid']


def fetch_danmu(cid):
    """Download the danmu of ``cid`` and return their texts as a list.

    Returns an empty list when the request fails or the status is not 200.
    """
    params = {'oid': cid}
    headers = {
        'accept': 'application/xml, text/xml, */*',
        'accept-encoding': 'gzip',
        'accept-language': _ACCEPT_LANGUAGE,
        'user-agent': _USER_AGENT,
    }

    response = _http_get(danmu_url, params, headers)
    if response is None:
        return []
    print(f"Danmu HTTP Status Code: {response.status_code}")

    if response.status_code != 200:
        print(f"Failed to fetch danmu for CID {cid}")
        return []

    # Decode explicitly as UTF-8; replace undecodable bytes rather than
    # losing the whole video's danmu to one bad byte.
    content = response.content.decode('utf-8', errors='replace')
    print("Danmu Response Content:")
    print(content)

    # Each danmu is a <d> element in the XML document.
    soup = BeautifulSoup(content, 'xml')
    return [d.text for d in soup.find_all('d')]


def count_and_rank_danmu(danmu_texts):
    """Count AI-related danmu, print the ranking and export the results.

    A danmu is AI-related when it contains at least one of the keywords
    below. Each distinct danmu is counted once per occurrence regardless of
    how many keywords it matches; each matched keyword is credited with the
    danmu's occurrence count.

    Args:
        danmu_texts: Iterable of danmu strings (duplicates are meaningful).
    """
    ai_keywords = ['人工智能', '机器学习', '深度学习', '自然语言处理', '计算机视觉', '智能算法', '大数据', 'AI', '智能制造', '智能家居', '智能医疗', '物联网', '云计算', '智能服务', '自动化']
    top_n = 8

    # Frequency of every distinct danmu text.
    counter = Counter(danmu_texts)

    ai_counter = Counter()       # AI-related danmu text -> occurrences
    keyword_counter = Counter()  # keyword -> occurrences of danmu containing it

    for text, count in counter.items():
        matched = [keyword for keyword in ai_keywords if keyword in text]
        if not matched:
            continue
        # Add the danmu once, not once per matched keyword, so danmu that hit
        # several keywords are not over-ranked.
        ai_counter[text] += count
        for keyword in matched:
            keyword_counter[keyword] += count

    ranked_ai_danmu = ai_counter.most_common(top_n)

    print("AI 技术应用关键词的出现次数:")
    for keyword, count in keyword_counter.items():
        print(f"{keyword}: {count} 次")

    print(f"\n排名前 {top_n} 的 AI 技术应用弹幕:")
    for text, count in ranked_ai_danmu:
        print(f"弹幕: {text} - 频率: {count} 次")

    # Export the statistics to Excel (which also renders the word cloud).
    export_to_excel(ranked_ai_danmu, keyword_counter)


def export_to_excel(ranked_ai_danmu, keyword_counter):
    """Write the ranking and keyword counts to ``danmu_statistics.xlsx``.

    Args:
        ranked_ai_danmu: List of ``(danmu_text, frequency)`` pairs.
        keyword_counter: Mapping of keyword to occurrence count.

    After writing the workbook, the word cloud is generated from the ranked
    danmu.
    """
    df_danmu = pd.DataFrame(ranked_ai_danmu, columns=['弹幕', '频率'])
    # Materialize the items view so DataFrame receives a plain list of rows.
    df_keywords = pd.DataFrame(
        list(keyword_counter.items()), columns=['关键词', '出现次数']
    )

    with pd.ExcelWriter('danmu_statistics.xlsx') as writer:
        df_danmu.to_excel(writer, sheet_name='AI 技术应用弹幕', index=False)
        df_keywords.to_excel(writer, sheet_name='AI 技术关键词', index=False)

    print("统计结果已导出到 danmu_statistics.xlsx")

    generate_wordcloud(df_danmu)


def generate_wordcloud(df_danmu):
    """Render a word cloud from the ``弹幕`` column, save it and display it.

    Nothing is rendered when the frame has no danmu, because ``WordCloud``
    raises ``ValueError`` when asked to plot zero words.
    """
    text = ' '.join(df_danmu['弹幕'])
    if not text.strip():
        print("No AI-related danmu found, skipping word cloud")
        return

    # NOTE(review): 'msyh.ttc' is presumably the Microsoft YaHei font shipped
    # with Windows; on other systems a CJK-capable font path is needed.
    wordcloud = WordCloud(
        width=800, height=400, background_color='white', font_path='msyh.ttc'
    ).generate(text)

    # Save before showing: plt.show() blocks until the window is closed and
    # may fail without a display, and the image should exist either way.
    wordcloud.to_file('ai_danmu_wordcloud.png')
    print("词云图已保存到 ai_danmu_wordcloud.png")

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


def main():
    """Search for the keyword, gather danmu of every hit and rank them."""
    search_keyword = '2024巴黎奥运会'
    video_limit = 300  # Total number of videos to collect.
    video_ids = search_bilibili(search_keyword, video_limit)

    all_danmu = []
    for bvid in video_ids:
        cid = get_video_cid(bvid)
        if cid:
            all_danmu.extend(fetch_danmu(cid))
            time.sleep(1)  # Throttle requests between videos.

    count_and_rank_danmu(all_danmu)


if __name__ == '__main__':
    main()