import re
import time
import xml.etree.ElementTree as ET
from collections import Counter, OrderedDict

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from DrissionPage import ChromiumPage
from PIL import Image
from wordcloud import WordCloud

headers = {
    # cookie: logged-in session credentials; Bilibili uses these to detect whether an account is signed in
    "cookie": "buvid4=B1349383-F2A6-E4E5-ED2C-A5B428CC0ED955473-022061918-9v4qmUz9VFkZrXyhSPgawQ%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO2216562263398747; is-2022-channel=1; CURRENT_FNVAL=4048; DedeUserID=406267828; DedeUserID__ckMd5=a7899a47ba8a07ab; enable_web_push=DISABLE; rpdid=|(u))kkYu||u0J'u~|JkJR)ul; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; CURRENT_QUALITY=80; buvid3=3D0F285C-7560-9E88-C05A-1AFF4D9CAD7617643infoc; b_nut=1718944717; _uuid=BAAADDA5-4DDD-184B-DB310-EC4105D97D51E17974infoc; header_theme_version=CLOSE; fingerprint=5d019e2bddbce042baa44b97851bb2fd; buvid_fp=5d019e2bddbce042baa44b97851bb2fd; PVID=3; SESSDATA=e97fd178%2C1741840358%2C6c71f%2A91CjB7QCyKSGsn3CjY0C0HRTQF_8a0AYz9r9QLq2FL2YiXLfQFehx5MdXHV3EDcePi4mwSVlM3TVE3LWcyczhLbXoycnhOTWtwa2g5OEl4RVdxOVRQZjljWGJ5d2czbFZpTkVhd3hfRnFmYTE4TVRISTlkdXFpemd1SmhOejNnVm1mYlJ0UDZEcTNBIIEC; bili_jct=4b351ffc7724b0bb10c029efdfad7f75; sid=6jnrbe6l; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1NDc1NzEsImlhdCI6MTcyNjI4ODMxMSwicGx0IjotMX0.l-PQPoCUFvsMgRpDwqUKwUY9jLLjI-p-HZ1Qaq7AIjI; bili_ticket_expires=1726547511; b_lsid=1564BAD10_191F00CC884; bp_t_offset_406267828=976996173829111808; home_feed_column=5; browser_resolution=2048-1030",
    # User-Agent: identifies the browser/device making the request
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}


def get_video_details(html):
    """Extract unique BV ids (in first-seen order) from a search-results page source."""
    soup = BeautifulSoup(html, 'html.parser')
    bv_pattern = re.compile(r'BV[1-9A-Za-z]{10}')
    bv_list = []
    for tag in soup.find_all('a', href=True):
        match = bv_pattern.search(tag['href'])
        if match:  # guard against hrefs that mention "BV" but don't contain a full id
            bv_list.append(match.group())
    return list(OrderedDict.fromkeys(bv_list))  # de-duplicate while keeping order


def get_video_info(bv_id):
    """Fetch video metadata from the web API and return the video's CID."""
    url = f'https://api.bilibili.com/x/web-interface/view?bvid={bv_id}'
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    response.raise_for_status()  # make sure the request succeeded
    data = response.json()
    return data['data']['cid']


def get_danmaku_xml(cid):
    """Download the danmaku (bullet-comment) XML for a given CID."""
    url = f'https://comment.bilibili.com/{cid}.xml'
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    response.raise_for_status()  # make sure the request succeeded
    return response.text


def parse_danmaku_xml(xml_str):
    """Parse the danmaku XML and return the list of comment texts."""
    root = ET.fromstring(xml_str)
    return [d.text for d in root.findall('.//d') if d.text]


def save_danmaku_to_excel(danmaku_content, file_name='danmaku_content.xlsx'):
    """
    Append danmaku texts to an Excel file.

    :param danmaku_content: list of danmaku strings
    :param file_name: Excel file name
    """
    try:
        # Reuse the existing file so repeated runs accumulate rows
        existing_df = pd.read_excel(file_name, engine='openpyxl')
    except FileNotFoundError:
        # No file yet: start from an empty frame
        existing_df = pd.DataFrame(columns=['content'])
    new_df = pd.DataFrame(danmaku_content, columns=['content'])
    updated_df = pd.concat([existing_df, new_df], ignore_index=True)
    updated_df.to_excel(file_name, index=False)
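
# For reference, the comment.bilibili.com endpoint returns XML shaped roughly like
# the sample below: one <d> element per danmaku, metadata packed into the `p`
# attribute, and the visible text as the element body. The attribute values here
# are made up; parse_danmaku_xml only keeps the text bodies.
_sample_xml = '<i><d p="1.2,1,25,16777215,1726288311,0,abc123,1">金牌!</d></i>'
assert parse_danmaku_xml(_sample_xml) == ['金牌!']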
def savef(video_url):
    """Fetch one video's danmaku and append it to the Excel file."""
    # Pull the BV id out of the video URL
    bv_id = re.search(r'BV\w+', video_url).group(0)
    # Resolve the BV id to a CID, then download and parse its danmaku
    cid = get_video_info(bv_id)
    xml_data = get_danmaku_xml(cid)
    danmakus = parse_danmaku_xml(xml_data)
    save_danmaku_to_excel(danmakus, file_name='danmaku_content.xlsx')


# Open the search-results page for "2024巴黎奥运会" (2024 Paris Olympics) in a browser
driver = ChromiumPage()
url = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page=2&o=36"
driver.get(url)

# Scrape up to 300 videos across 8 result pages
j = 0
for page in range(8):
    # Parse the browser's current page source so the pagination clicks below
    # actually take effect (re-requesting the fixed `url` would always return the same page)
    bv_list = get_video_details(driver.html)
    for bv in bv_list:
        if j == 300:
            break
        j += 1
        video_url = f'https://www.bilibili.com/video/{bv}/?spm_id_from=333.337'
        print(f"v{j}")  # progress indicator
        savef(video_url)
    if j == 300:
        break
    # Click next page: of the pager's two "side" buttons (prev/next),
    # the last one is assumed to be "next"
    driver.eles('css:.vui_button.vui_pagenation--btn.vui_pagenation--btn-side')[-1].click()
    time.sleep(2)  # crude wait for the next results page to render

# Load the collected danmaku back from the Excel file
df = pd.read_excel('danmaku_content.xlsx', engine='openpyxl')
danmaku_column = df['content']


def filter_ai_danmaku(danmaku_series, keywords):
    """Keep only the danmaku that contain at least one of the given keywords."""
    ai_danmaku = []
    for danmaku in danmaku_series:
        if pd.notna(danmaku):  # skip empty cells
            danmaku_str = str(danmaku)
            if any(keyword in danmaku_str for keyword in keywords):
                ai_danmaku.append(danmaku_str)
    return ai_danmaku


ai_keywords = ["AI", "AI技术", "机器学习", "深度学习", "智能", "VR/AR", "全景直播", "360度", "3D", "追踪",
               "虚拟", "数字", "人工智能", "面部识别", "云技术", "安保", "检测", "监测", "福州大学"]

# Keep only the AI-related danmaku, count duplicates, and take the top 15
filtered_danmaku = filter_ai_danmaku(danmaku_column, ai_keywords)
danmaku_count = Counter(filtered_danmaku)
sorted_danmaku = danmaku_count.most_common(15)

# Print the top danmaku with their counts
for i, (danmaku, count) in enumerate(sorted_danmaku, 1):
    print(f"Rank {i}: {danmaku} - count: {count}")

# Export the ranking to Excel (the Chinese column names are read back below, so keep them unchanged)
danmaku_df = pd.DataFrame(sorted_danmaku, columns=['弹幕内容', '数量'])
danmaku_df.to_excel('ai_danmaku_statistics.xlsx', index=False, engine='openpyxl')


def generate_beautiful_wordcloud(file_name='ai_danmaku_statistics.xlsx', mask_image_path=None,
                                 font_path=r"C:\Users\wyk93\Desktop\苹方黑体-准-简.ttf"):
    """Render a word cloud from the exported danmaku ranking."""
    df = pd.read_excel(file_name)
    # Join all danmaku into one string and segment it with jieba (full mode)
    text = ' '.join(df['弹幕内容'].dropna().astype(str))
    segmented_text = " ".join(jieba.cut(text, cut_all=True))

    if mask_image_path:
        # Binarize the mask: light pixels (>200) become white (masked out by WordCloud),
        # dark pixels define the region the words may fill
        mask_image = np.array(Image.open(mask_image_path).convert('L'))
        mask_image[mask_image > 200] = 255
        mask_image[mask_image <= 200] = 0
        background_image = Image.open(mask_image_path)
    else:
        mask_image = None
        background_image = None

    wordcloud = WordCloud(
        background_color='white',
        mask=mask_image,
        width=2000,   # width/height are ignored when a mask is given
        height=2000,
        colormap='viridis',
        max_words=20000,
        font_path=font_path  # a Chinese font is required to render Chinese text
    ).generate(segmented_text)

    # Blend the word cloud over the original mask image, if one was given
    plt.figure(figsize=(10, 8))
    if background_image is not None:
        plt.imshow(background_image, interpolation='bilinear')
        plt.imshow(wordcloud, interpolation='bilinear', alpha=0.4)  # semi-transparent overlay
    else:
        plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()


# Generate the word cloud, optionally shaped by a custom mask image
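# The font and mask paths used here are machine-specific; swap in your own.
# Calling without a mask also works and yields a plain rectangular cloud, e.g.:
# generate_beautiful_wordcloud(mask_image_path=None)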
generate_beautiful_wordcloud(mask_image_path=r"C:\Users\wyk93\Desktop\图片素材\sucai3.jpg")
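
# Optional follow-up (a sketch, not part of the original script): WordCloud.to_file
# can persist the raw cloud to disk. Inside generate_beautiful_wordcloud, after
# .generate(...), something like:
#     wordcloud.to_file('danmaku_wordcloud.png')  # hypothetical output name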