import requests
import re
import time
import pandas as pd  # pandas for data handling and Excel I/O
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from snownlp import SnowNLP

# Shared request headers: a generic desktop User-Agent so Bilibili serves the
# same pages it would to a browser (the original value was truncated).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

# Scrape the BV ids of Bilibili videos matching a keyword.
def get_bv_and_cid(keyword, limit=300):
    bvid_set = set()  # use a set to deduplicate BV ids
    for page in range(1, 15):  # paginate through search results
        if len(bvid_set) >= limit:
            break
        url = f"https://search.bilibili.com/video?keyword={keyword}&page={page}&page_size=30"
        try:
            response = requests.get(url, headers=HEADERS)
            response.raise_for_status()  # make sure the request succeeded
            bvid_set.update(re.findall('href="//www.bilibili.com/video/(.*?)/"', response.text))  # extract BV ids
        except requests.RequestException as e:
            print(f"Request error: {e}")
    bvid_list = list(bvid_set)[:limit]  # convert to a list capped at `limit`
    print(f"Fetched {len(bvid_list)} BV ids")
    return bvid_list
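
# The HTML scrape above depends on Bilibili's search-page markup, which can
# change at any time. A minimal alternative sketch using the JSON search
# endpoint follows; the endpoint, its parameters, and the response layout are
# assumptions here, not a verified contract, and recent versions of the site
# may refuse to answer without browser cookies.
def get_bv_via_api(keyword, limit=300):
    bvids = []
    for page in range(1, 15):
        if len(bvids) >= limit:
            break
        url = "https://api.bilibili.com/x/web-interface/search/type"  # assumed endpoint
        params = {"search_type": "video", "keyword": keyword, "page": page}
        try:
            resp = requests.get(url, headers=HEADERS, params=params, timeout=10)
            resp.raise_for_status()
            results = resp.json().get("data", {}).get("result", [])  # assumed response layout
            bvids.extend(item["bvid"] for item in results if "bvid" in item)
        except (requests.RequestException, ValueError) as e:
            print(f"Search API request failed: {e}")
            break
        time.sleep(0.5)
    return bvids[:limit]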
# Scrape the danmu (bullet comments) for each BV id.
def get_danmu(bvid_list):
    danmu_data = []  # collected danmu text
    for bv in bvid_list:
        url = f'https://www.bilibili.com/video/{bv}/'  # video page URL
        try:
            response = requests.get(url=url, headers=HEADERS)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Request error for {bv}: {e}")
            continue
        response.encoding = 'utf-8'
        # pull the cid(s) out of the embedded player JSON on the page
        oids = re.findall('"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),', response.text)
        print(f"{bv}: cids {oids}")
        # fetch the danmu XML for each cid
        for cid in oids:
            url = f"https://comment.bilibili.com/{cid}.xml"
            response = requests.get(url=url, headers=HEADERS)
            response.encoding = 'utf-8'
            danmu_list = re.findall('<d p=".*?">(.*?)</d>', response.text)  # extract danmu text
            print(danmu_list)
            danmu_data.extend(danmu_list)
            time.sleep(0.5)  # throttle so we do not hit the server too fast
    return danmu_data
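
# Parsing cids out of the embedded player JSON is brittle. A commonly used
# alternative is the pagelist endpoint sketched below; treat the URL and the
# response shape as assumptions rather than a stable API guarantee.
def get_cids_via_api(bvid):
    url = "https://api.bilibili.com/x/player/pagelist"  # assumed endpoint
    try:
        resp = requests.get(url, headers=HEADERS, params={"bvid": bvid}, timeout=10)
        resp.raise_for_status()
        return [page["cid"] for page in resp.json().get("data", [])]  # assumed layout
    except (requests.RequestException, ValueError) as e:
        print(f"pagelist request failed for {bvid}: {e}")
        return []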
# Save the danmu to both a text file and an Excel file.
def save_danmu(danmu_list, filename_txt="danmu.txt", filename_xlsx="danmu.xlsx"):
    with open(filename_txt, "w", encoding="utf-8") as file:
        for danmu in danmu_list:
            file.write(danmu + "\n")
    df = pd.DataFrame(danmu_list, columns=['弹幕内容'])  # column name means "danmu content"
    df.to_excel(filename_xlsx, index=False, engine='openpyxl')
# Read the Excel file back and keep only danmu containing the given keywords.
def read_and_filter_danmu(file_path):
    df = pd.read_excel(file_path, engine='openpyxl')
    # keywords: AI, AI technology, AI applications, artificial intelligence
    # (and its applications), machine learning, generative
    keywords = ['AI', 'AI技术', 'AI技术应用', '人工智能应用', '人工智能', '机器学习', '生成']
    filtered_danmu = df[df['弹幕内容'].str.contains('|'.join(keywords), na=False, case=False)]
    return filtered_danmu
# Count and rank the filtered danmu to find the most frequent ones.
def count_and_sort_danmu(danmu_df):
    danmu_counts = danmu_df['弹幕内容'].value_counts()
    top_danmu = danmu_counts.head(8)
    return top_danmu
# Save the most frequent danmu to an Excel file.
def save_top_danmu(top_danmu, output_file):
    top_danmu_df = pd.DataFrame(top_danmu).reset_index()
    top_danmu_df.columns = ['弹幕内容', '数量']  # columns: "danmu content", "count"
    top_danmu_df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Top danmu saved to {output_file}")
# Render a word cloud image from the danmu.
def generate_wordcloud(danmu_list, output_image_path):
    text = ' '.join(danmu_list)
    # msyh.ttc (Microsoft YaHei) is required so Chinese characters render correctly
    wordcloud = WordCloud(font_path='msyh.ttc', background_color='white', width=800, height=600, min_font_size=10)
    wordcloud.generate(text)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(output_image_path, dpi=300)
    plt.show()
# Score sentiment with the snownlp library (0 = negative, 1 = positive).
def analyze_sentiment(danmu_list):
    sentiment_results = []
    for danmu in danmu_list:
        s = SnowNLP(danmu)
        sentiment_score = s.sentiments
        sentiment_results.append(sentiment_score)
    return sentiment_results
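
# Raw per-danmu scores are hard to read on their own. This helper is an
# addition, not part of the original pipeline: it buckets SnowNLP scores with
# a conventional 0.5 cutoff, which is an assumed threshold, not a SnowNLP rule.
# Usage: print(summarize_sentiment(analyze_sentiment(top_list)))
def summarize_sentiment(scores, threshold=0.5):
    if not scores:
        return {"count": 0, "mean": None, "positive": 0, "negative": 0}
    positive = sum(1 for s in scores if s >= threshold)
    return {
        "count": len(scores),
        "mean": sum(scores) / len(scores),
        "positive": positive,
        "negative": len(scores) - positive,
    }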
if __name__ == "__main__":
    keyword = "2024巴黎奥运会"  # "2024 Paris Olympics"
    bvid_list = get_bv_and_cid(keyword)
    danmu_list = get_danmu(bvid_list)
    save_danmu(danmu_list)
    file_path = 'danmu.xlsx'
    filtered_danmu = read_and_filter_danmu(file_path)
    top_danmu = count_and_sort_danmu(filtered_danmu)
    output_file = 'top_danmu.xlsx'
    save_top_danmu(top_danmu, output_file)
    file_path = 'top_danmu.xlsx'
    top_danmu_df = pd.read_excel(file_path, engine='openpyxl')
    top_list = top_danmu_df['弹幕内容'].tolist()
    if not top_list:
        print("No danmu left after filtering; skipping the word cloud step.")
    else:
        output_image_path = 'wordcloud.png'
        generate_wordcloud(top_list, output_image_path)
        sentiments = analyze_sentiment(top_list)
        print("Sentiment analysis results:", sentiments)