"""Crawl Bilibili danmaku for Paris-Olympics videos and analyse AI-related ones.

Pipeline:

1. Search Bilibili for ``SEARCH_KEYWORD`` and collect the danmaku (bullet
   comments) of up to ``MAX_VIDEOS`` videos.
2. Save every collected danmaku to ``DANMUKU_FILE``, one per line.
3. Keep the danmaku that mention one of ``KEYWORDS``, count them and write
   the ``TOP_N`` most frequent ones to ``EXCEL_FILE``.
4. Render a word cloud from that Excel sheet into ``WORDCLOUD_FILE``.

NOTE(review): this script was recovered from a diff that added it under the
name ``requirements.txt``; it is Python source, not a pip requirements list.
"""

import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

SEARCH_URL = "https://api.bilibili.com/x/web-interface/search/type"
PAGELIST_URL = "https://api.bilibili.com/x/player/pagelist"
DANMUKU_URL = "https://comment.bilibili.com/{cid}.xml"

SEARCH_KEYWORD = "巴黎奥运会"
LAST_PAGE = 21        # search pages 1..21 are requested
MAX_VIDEOS = 300      # stop once this many videos have been processed
REQUEST_DELAY = 1     # seconds to wait between videos to avoid being blocked
TOP_N = 8             # number of most frequent danmaku exported

DANMUKU_FILE = '所有视频弹幕.txt'
EXCEL_FILE = 'AI_人工智能_弹幕统计.xlsx'
WORDCLOUD_FILE = '词云图.png'
FONT_PATH = r'C:\Windows\Fonts\simhei.ttf'  # a CJK-capable font for the word cloud

KEYWORDS = ['AI识曲', 'AI生成', '神经网络', '卷积神经网络', '循环神经网络', '智能家居',
            '自动驾驶', '智能推荐', '智能算法', '强化学习', '计算机视觉', 'ai还原', 'ai合成']

headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
    ),
}


def get_cid(bvid):
    """Return the cid of the first part of video *bvid*, or None on failure.

    Network errors and unexpected response payloads are reported on stdout
    and turned into ``None`` so that one bad video does not stop the crawl.
    """
    try:
        response = requests.get(
            PAGELIST_URL, params={"bvid": bvid}, headers=headers, timeout=10
        )
        response.raise_for_status()
        return response.json()['data'][0]['cid']
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # The API answered, but without the expected data[0].cid entry.
        print(f"请求失败: 响应格式异常 ({e!r})")
    return None


def get_danmuku(cid):
    """Return the list of danmaku texts for *cid* (empty on failure or None)."""
    if cid is None:
        return []
    try:
        response = requests.get(
            DANMUKU_URL.format(cid=cid), headers=headers, timeout=10
        )
        # Do not parse an HTTP error page as if it were the danmaku XML.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'xml')
        return [node.text for node in soup.find_all('d')]
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return []


def crawl_danmuku(keyword=SEARCH_KEYWORD, max_videos=MAX_VIDEOS,
                  last_page=LAST_PAGE, delay=REQUEST_DELAY):
    """Collect the danmaku of up to *max_videos* search results for *keyword*.

    Search pages ``1..last_page`` are requested in order; a page that fails
    to load or has an unexpected payload is reported and skipped. The
    function sleeps *delay* seconds after each video so the requests are
    actually throttled.

    Returns the flat list of all danmaku texts collected.
    """
    collected = []
    crawled = 0  # number of videos processed so far
    for page in range(1, last_page + 1):
        # Let requests build the query string so the non-ASCII keyword is
        # percent-encoded correctly.
        params = {"search_type": "video", "keyword": keyword, "page": page}
        try:
            response = requests.get(
                SEARCH_URL, params=params, headers=headers, timeout=10
            )
            response.raise_for_status()
            results = response.json()['data']['result']
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            continue
        except (KeyError, TypeError, ValueError) as e:
            print(f"请求失败: 响应格式异常 ({e!r})")
            continue

        for result in results:
            bvid = result.get('bvid') if isinstance(result, dict) else None
            if bvid is None:
                continue
            collected.extend(get_danmuku(get_cid(bvid)))
            crawled += 1
            if crawled >= max_videos:
                return collected
            time.sleep(delay)
    return collected


def save_danmuku(danmuku_list, path=DANMUKU_FILE):
    """Write *danmuku_list* to *path* as UTF-8 text, one danmaku per line."""
    with open(path, 'w', encoding='utf-8') as file:
        file.writelines(f"{text}\n" for text in danmuku_list)


def load_danmuku(path=DANMUKU_FILE):
    """Read danmaku from *path*, stripped of surrounding whitespace.

    Blank lines are dropped. Stripping here (rather than only when
    exporting) keeps the trailing newline from making otherwise identical
    danmaku count as different entries.

    Raises FileNotFoundError if *path* does not exist.
    """
    with open(path, 'r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        return [text for text in stripped if text]


def filter_danmuku(danmuku_list, keywords):
    """Return the danmaku that contain any of *keywords*, case-insensitively."""
    keywords_lower = [keyword.lower() for keyword in keywords]
    return [
        d for d in danmuku_list
        if any(keyword in d.lower() for keyword in keywords_lower)
    ]


def export_top_danmuku(danmuku_list, path=EXCEL_FILE, top_n=TOP_N):
    """Write the *top_n* most frequent danmaku and their counts to *path*.

    The sheet has two columns: '弹幕内容' (text) and '数量' (count).
    """
    most_common = Counter(danmuku_list).most_common(top_n)
    data = {
        '弹幕内容': [content.strip() for content, _ in most_common],
        '数量': [count for _, count in most_common],
    }
    pd.DataFrame(data).to_excel(path, index=False)
    print(f"前{top_n}位弹幕统计已保存到 '{path}'.")


def draw_wordcloud(excel_path=EXCEL_FILE, font_path=FONT_PATH,
                   output_path=WORDCLOUD_FILE):
    """Render a word cloud from the '弹幕内容' column of *excel_path*.

    The image is saved to *output_path* and then displayed. Failures are
    reported on stdout instead of being raised, since this is the last,
    purely cosmetic step of the script.
    """
    try:
        df = pd.read_excel(excel_path)
        if '弹幕内容' not in df.columns:
            raise ValueError("Excel 文件中没有找到 '弹幕内容' 列")
        # Cells may come back from Excel as numbers; join() needs strings.
        text = ' '.join(df['弹幕内容'].dropna().astype(str))
        wordcloud = WordCloud(
            font_path=font_path, width=800, height=400, background_color='white'
        ).generate(text)
        # Save first so the image exists even if no window can be shown.
        wordcloud.to_file(output_path)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
    except FileNotFoundError:
        print("文件未找到,请检查文件路径")
    except ValueError as ve:
        print(ve)
    except Exception as e:  # top-level boundary: report and finish
        print(f"发生错误: {e}")


def main():
    """Run the crawl -> save -> filter -> export -> word cloud pipeline."""
    collected = crawl_danmuku()
    if collected:
        # Only overwrite the danmaku file when the crawl produced something,
        # so a failed run does not wipe the result of an earlier one.
        save_danmuku(collected)

    try:
        all_danmuku = load_danmuku()
    except FileNotFoundError:
        print("文件未找到,请检查文件路径")
        return

    filtered_danmuku = filter_danmuku(all_danmuku, KEYWORDS)
    export_top_danmuku(filtered_danmuku)
    draw_wordcloud()


if __name__ == "__main__":
    main()