"""Collect Bilibili danmaku (bullet comments) and summarise AI-related ones.

Pipeline, executed by :func:`main` when the file is run as a script:

1. Search Bilibili videos for ``SEARCH_KEYWORD`` and download the danmaku of
   up to ``MAX_VIDEOS`` videos.
2. Persist every collected danmaku to ``DANMAKU_FILE`` (one per line).
3. Keep only the danmaku that mention one of ``KEYWORDS``, count them and
   write the ``TOP_N`` most frequent ones to ``EXCEL_FILE``.
4. Render a word cloud of those top danmaku to ``WORDCLOUD_FILE``.
"""
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Number of videos processed so far.
cnt = 0
# Every danmaku collected during the crawl.
danmuku_all = []

headers = {
    # Placeholder value: replace with a real browser cookie before running.
    "cookie": "cookie",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

SEARCH_KEYWORD = '巴黎奥运会'
MAX_VIDEOS = 300
# The original loop was range(1, 22), i.e. search pages 1..21.
MAX_PAGES = 21
# Seconds to wait between search pages so the crawler is less likely to be blocked.
REQUEST_DELAY = 1
TOP_N = 8

DANMAKU_FILE = '所有视频弹幕.txt'
EXCEL_FILE = 'AI_人工智能_弹幕统计.xlsx'
WORDCLOUD_FILE = '词云图.png'
# A font with CJK glyphs is needed to render the Chinese text.
# This path only exists on Windows; point it elsewhere on other systems.
FONT_PATH = r'C:\Windows\Fonts\simhei.ttf'

KEYWORDS = [
    'AI配音', 'ai配音', '人工智能', 'ai画图', 'AI画图', 'AI识曲', 'AI生成',
    '神经网络', '卷积神经网络', '循环神经网络', '智能家居', '自动驾驶',
    '智能推荐', '智能算法', '强化学习', '计算机视觉', 'ai还原', 'ai合成',
]


def get_cid(bvid):
    """Return the ``cid`` of the first page of video *bvid*.

    Returns ``None`` (after printing the reason) when the request fails or the
    response does not have the expected ``data[0].cid`` structure.
    """
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.json()['data'][0]['cid']
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # Error payloads carry no usable "data" list; treat them like a
        # failed request instead of aborting the whole crawl.
        print(f"响应解析失败: {e!r}")
    return None


def get_danmuku(cid):
    """Return the text of every danmaku of the video part identified by *cid*.

    Returns an empty list when *cid* is ``None`` or the download fails.
    """
    if cid is None:
        return []
    url = f"https://comment.bilibili.com/{cid}.xml"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Do not try to parse an HTTP error page as a danmaku document.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'xml')
        return [node.text for node in soup.find_all('d')]
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return []


def _search_page(keyword, page):
    """Return the list of video search results on *page*, or ``[]`` on failure."""
    url = 'https://api.bilibili.com/x/web-interface/search/type'
    params = {'search_type': 'video', 'keyword': keyword, 'page': page}
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"请求失败: {e}")
        return []
    # Navigate defensively: "data" or "result" may be missing or null.
    data = payload.get('data') if isinstance(payload, dict) else None
    results = data.get('result') if isinstance(data, dict) else None
    return results if isinstance(results, list) else []


def crawl_danmuku(keyword=SEARCH_KEYWORD, max_videos=MAX_VIDEOS,
                  max_pages=MAX_PAGES, delay=REQUEST_DELAY):
    """Download danmaku for up to *max_videos* search hits for *keyword*.

    Appends to the module-level ``danmuku_all`` list, updates the module-level
    ``cnt`` counter, and returns ``danmuku_all``. Sleeps *delay* seconds
    between consecutive search pages.
    """
    global cnt
    for page in range(1, max_pages + 1):
        if cnt >= max_videos:
            break
        if page > 1:
            # Throttle between pages; the original sleep sat outside the loop
            # and therefore never ran between requests.
            time.sleep(delay)
        for result in _search_page(keyword, page):
            bvid = result.get('bvid') if isinstance(result, dict) else None
            if not bvid:
                continue
            danmuku_all.extend(get_danmuku(get_cid(bvid)))
            cnt += 1
            if cnt >= max_videos:
                break
    return danmuku_all


def save_danmuku(danmuku_list, path=DANMAKU_FILE):
    """Write *danmuku_list* to *path* as UTF-8 text, one danmaku per line."""
    with open(path, 'w', encoding='utf-8') as file:
        file.writelines(f"{line}\n" for line in danmuku_list)


def load_danmuku(path=DANMAKU_FILE):
    """Read danmaku from *path*, stripped of surrounding whitespace.

    Blank lines are dropped. Stripping here (rather than only when exporting)
    keeps identical comments from being counted separately because of a
    trailing newline.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]


def filter_danmuku(danmuku_list, keywords):
    """Return the danmaku that contain at least one keyword, ignoring case.

    The relative order of *danmuku_list* is preserved.
    """
    # A set removes keywords that only differ by case (e.g. 'AI配音'/'ai配音').
    keywords_lower = {keyword.lower() for keyword in keywords}
    return [
        d for d in danmuku_list
        if any(keyword in d.lower() for keyword in keywords_lower)
    ]


def export_top_danmuku(filtered_danmuku, top_n=TOP_N, path=EXCEL_FILE):
    """Write the *top_n* most frequent danmaku and their counts to *path*.

    Returns the ``DataFrame`` that was written (columns ``弹幕内容``, ``数量``).
    """
    most_common = Counter(filtered_danmuku).most_common(top_n)
    df = pd.DataFrame({
        '弹幕内容': [content.strip() for content, _ in most_common],
        '数量': [count for _, count in most_common],
    })
    df.to_excel(path, index=False)
    print(f"前{top_n}位弹幕统计已保存到 '{path}'.")
    return df


def draw_wordcloud(excel_path=EXCEL_FILE, image_path=WORDCLOUD_FILE,
                   font_path=FONT_PATH):
    """Render the ``弹幕内容`` column of *excel_path* as a word cloud.

    The image is written to *image_path* and then displayed. Failures are
    reported on stdout instead of being raised.
    """
    try:
        df = pd.read_excel(excel_path)
        if '弹幕内容' not in df.columns:
            raise ValueError("Excel 文件中没有找到 '弹幕内容' 列")
        text = ' '.join(df['弹幕内容'].dropna().astype(str))
        wordcloud = WordCloud(
            font_path=font_path,
            width=800,
            height=400,
            background_color='white',
        ).generate(text)
        # Save before showing so the image is written even if displaying it
        # fails afterwards.
        wordcloud.to_file(image_path)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
    except FileNotFoundError:
        print("文件未找到,请检查文件路径")
    except ValueError as ve:
        print(ve)
    except Exception as e:
        print(f"发生错误: {e}")


def main():
    """Run the crawl -> save -> filter -> export -> word-cloud pipeline."""
    crawl_danmuku()
    if danmuku_all:
        # Persist the crawl; the original never wrote the file it later read.
        save_danmuku(danmuku_all)
    try:
        # If nothing was crawled, this reads a danmaku file left by an
        # earlier run, when one exists.
        all_danmuku = load_danmuku()
    except FileNotFoundError:
        print("文件未找到,请检查文件路径")
        return
    filtered_danmuku = filter_danmuku(all_danmuku, KEYWORDS)
    export_top_danmuku(filtered_danmuku)
    draw_wordcloud()


if __name__ == "__main__":
    main()