"""Collect Bilibili danmaku about AI at the 2024 Paris Olympics and summarise them.

Recovered from the patch "ADD file via upload" (commit 623444d, 2024-09-18)
that created ``组合.py``.

Pipeline (run by ``main``):

1. Query the Bilibili video search API for ``2024巴黎奥运会`` and append every
   BV id found to ``bvnumbers.txt``.
2. For each BV id, look up its page list, record each ``cid`` in ``cids.txt``
   and download that cid's danmaku XML into ``B站弹幕.csv``.
3. Keep the danmaku that contain one of ``ai_olympic_keywords``, write them to
   ``包含关键词的弹幕.txt`` and the eight most frequent ones to
   ``top_ai_olympic_danmus.txt``.
4. Render a word cloud of the kept danmaku to ``ciyun.png``.

The session cookie is read from the ``BILIBILI_COOKIE`` environment variable;
credentials must not be committed to the source file.
"""
try:
    # Unused by this script. ``audioop`` was removed from the standard library
    # in Python 3.13, so an unguarded import would crash at start-up there.
    from audioop import avgpp
except ImportError:
    avgpp = None
import os
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
import requests
from imageio import imread
from PIL import Image
from wordcloud import WordCloud

# Name of the environment variable that holds the Bilibili session cookie.
COOKIE_ENV_VAR = 'BILIBILI_COOKIE'
# Seconds to wait for any single HTTP response; without it a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

SEARCH_URL = "https://api.bilibili.com/x/web-interface/wbi/search/type"
SEARCH_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
PAGELIST_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; MS-RTC LM 8; InfoPath.2; Tablet PC 2.0)'

BV_FILE = 'bvnumbers.txt'
# Optional, externally prepared list of BV ids; nothing in this script writes it.
EXTRACTED_BV_FILE = 'extracted_bv_numbers.txt'
TOP_DANMU_FILE = 'top_ai_olympic_danmus.txt'
WORD_CLOUD_FILE = 'ciyun.png'
# Raw string: the original non-raw literal relied on invalid escape sequences
# (\W, \F, \S). The resulting path is unchanged.
FONT_PATH = r'C:\Windows\Fonts\SIMLI.TTF'

comment_file_path = 'B站弹幕.csv'
output_file = 'cids.txt'  # file that stores the cids
api_base_url = "https://api.bilibili.com/x/player/pagelist?bvid="
output_file_path = '包含关键词的弹幕.txt'
input_file_path = 'B站弹幕.csv'

# Keywords related to the use of AI technology at the 2024 Paris Olympics.
ai_olympic_keywords = [
    "巴黎奥运会AI", "AI技术", "智能", "子弹时间", "AI增强", "赛事转播", "辅助训练", "AI回放系统",
    "3d模型", "沉浸式虚拟重建", "AI重塑", "运动捕捉", "特效渲染", "AI修复", "AI", "人工智能",
]

BV_PATTERN = re.compile(r'"bvid":"(BV[\d\w]{10})"')
# The original pattern was the bare '(.*?)', which matches only empty strings,
# so nothing but blank lines was ever saved.
# NOTE(review): assumes each danmaku is wrapped as <d p="...">text</d> in the
# XML response - confirm against a live response.
DANMAKU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')


def _build_headers(user_agent):
    """Return request headers for *user_agent*, adding the cookie when configured."""
    built = {'User-Agent': user_agent}
    cookie = os.environ.get(COOKIE_ENV_VAR)
    if cookie:
        built['cookie'] = cookie
    return built


# Headers used for the page-list API requests.
headers = _build_headers(PAGELIST_USER_AGENT)


def spider_page(cid):
    """Download the danmaku XML for *cid* and append each danmaku to ``comment_file_path``.

    The raw response and every extracted danmaku are echoed to stdout. Nothing
    is written when the response status is not 200. Returns ``None``.
    """
    url = f'http://comment.bilibili.com/{cid}.xml'
    # Placeholder values kept from the original source.
    request_headers = {
        'referer': 'xxxxx',
        'User-Agent': 'xxxxx',
        'cookie': "xxxxx"
    }
    resp = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT)
    resp.encoding = resp.apparent_encoding
    print(resp.text)
    if resp.status_code == 200:
        content_list = DANMAKU_PATTERN.findall(resp.text)
        # Open the output once per response rather than once per danmaku.
        with open(comment_file_path, 'a', encoding='utf-8') as fin:
            for item in content_list:
                fin.write(item + '\n')
                print(item)
    print('-------------弹幕获取完毕!-------------')


def extract_cid_number(text):
    """Return the digits following the first ``cid:`` in *text*, or ``None`` if absent."""
    match = re.search(r'cid:(\d+)', text)
    if match:
        return match.group(1)
    return None


def _search_params(page):
    """Return the query parameters for result page *page* of the video search."""
    # NOTE(review): 'w_rid' and 'wts' are fixed values copied from a captured
    # request; they look like a signature/timestamp pair - confirm the API
    # still accepts them for every page.
    return {
        'category_id': '',
        'search_type': 'video',
        'ad_resource': 5654,
        '__refresh__': 'true',
        'context': '',
        'page': page,
        'page_size': 30,
        'pubtime_begin_s': 0,
        'pubtime_end_s': 0,
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': 1,
        'single_column': 0,
        'keyword': '2024巴黎奥运会',
        'qv_id': 's5p9ZoGL8W8aU7TP3gWJ1xLzPA6njovt',
        'source_tag': 3,
        'gaia_vtoken': '',
        'dynamic_offset': 24,
        'web_location': 1430654,
        'w_rid': '5475898fdbb1cc8a359f93dd5826e3f9',
        'wts': 1726221162
    }


def collect_bv_numbers(page_count=10):
    """Search *page_count* result pages and return the BV ids found, in order.

    Each BV id is also appended to ``bvnumbers.txt``, one per line. With the
    default of 10 pages and a page size of 30 this requests up to 300 results.
    """
    search_headers = _build_headers(SEARCH_USER_AGENT)
    collected = []
    with open(BV_FILE, 'a', encoding='utf-8') as file:
        for page in range(1, page_count + 1):
            response = requests.get(
                SEARCH_URL,
                headers=search_headers,
                params=_search_params(page),
                timeout=REQUEST_TIMEOUT,
            )
            print(response.text)
            for match in BV_PATTERN.finditer(response.text):
                bv = match.group(1)
                file.write(bv + '\n')  # one BV id per line
                collected.append(bv)
            response.close()
            time.sleep(2)  # pause between search pages
    return collected


def load_bv_numbers(fallback):
    """Return the BV ids to crawl.

    ``extracted_bv_numbers.txt`` is used when it exists, as in the original
    script. Because nothing here creates that file, the de-duplicated
    *fallback* list is used when it is missing instead of crashing.
    """
    try:
        with open(EXTRACTED_BV_FILE, 'r', encoding='utf-8') as file:
            return file.read().splitlines()
    except FileNotFoundError:
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(fallback))


def crawl_danmaku(bvnumbers):
    """Fetch the page list of every BV id, record its cids and download their danmaku."""
    # Mode 'w' starts from an empty cid file on every run.
    with open(output_file, 'w', encoding='utf-8') as cid_file:
        for bvid in bvnumbers:
            api_url = f"{api_base_url}{bvid}"
            try:
                response = requests.get(api_url, headers=headers, timeout=REQUEST_TIMEOUT)
                if response.status_code == 200:
                    data = response.json()
                    # Make sure the payload has the expected structure.
                    if 'data' in data and isinstance(data['data'], list):
                        for page in data['data']:
                            cid = page.get('cid')
                            if cid is not None:
                                cid_file.write(f"{cid}\n")
                                spider_page(cid)
                else:
                    print(f"请求失败,状态码: {response.status_code}")
                    time.sleep(1)
            except Exception as e:
                # Best effort: one failing video must not abort the whole crawl.
                print(f"处理BV号 {bvid} 时发生错误: {e}")
            time.sleep(1)  # pause between videos
    print("所有BV号已处理完毕。")


def filter_keyword_danmus():
    """Copy keyword-matching danmaku to ``output_file_path`` and count them.

    Returns a ``Counter`` mapping each matching danmaku text to the number of
    times it occurs in ``input_file_path``.
    """
    danmu_counts = Counter()
    with open(input_file_path, mode='r', encoding='utf-8') as source, \
            open(output_file_path, mode='w', encoding='utf-8') as kept:
        for line in source:
            danmu = line.strip()
            if any(keyword in danmu for keyword in ai_olympic_keywords):
                danmu_counts[danmu] += 1
                kept.write(danmu + '\n')
    return danmu_counts


def build_word_cloud():
    """Segment the kept danmaku with jieba and render them to ``ciyun.png``."""
    with open(output_file_path, encoding='utf-8') as f:
        text = f.read()
    if not text.strip():
        # Nothing matched any keyword, so there are no words to draw.
        print("没有包含关键词的弹幕,跳过词云生成。")
        return
    text_list = jieba.lcut(text)
    print(text_list)
    # Join with spaces: joining with '' would rebuild the unsegmented text and
    # discard the word boundaries jieba just produced.
    text_str = ' '.join(text_list)
    print(text_str)
    wc = WordCloud(
        width=2000,
        height=1000,
        background_color='white',
        font_path=FONT_PATH,
    )
    wc.generate(text_str)
    wc.to_file(WORD_CLOUD_FILE)


def main():
    """Run the whole pipeline: search, crawl, filter, word cloud, report."""
    if not os.environ.get(COOKIE_ENV_VAR):
        print(f"未设置环境变量 {COOKIE_ENV_VAR},将不携带 cookie 发送请求。")

    collected = collect_bv_numbers()
    crawl_danmaku(load_bv_numbers(collected))

    top_8_danmus = filter_keyword_danmus().most_common(8)
    with open(TOP_DANMU_FILE, 'w', encoding='utf-8') as output:
        for danmu, _count in top_8_danmus:
            output.write(f"{danmu}\n")

    build_word_cloud()

    print("出现次数最多的前8条关于2024巴黎奥运会赛事应用AI技术的弹幕:")
    for danmu, count in top_8_danmus:
        print(f"'{danmu}' 出现了 {count} 次")


if __name__ == "__main__":
    main()