import requests
import re
import time
import pandas as pd  # data handling and Excel file I/O
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from snownlp import SnowNLP
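
# Third-party dependencies used below (install via pip): requests, pandas,
# openpyxl (pandas' Excel engine), wordcloud, matplotlib, snownlp.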


# Scrape BV ids of Bilibili videos matching a search keyword
# (despite the name, cids are resolved later, in get_danmu)
def get_bv_and_cid(keyword, limit=300):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."  # mimic a browser request
    }
    bvid_set = set()  # a set avoids duplicate BV ids
    for page in range(1, 15):  # paginate through the search results
        if len(bvid_set) >= limit:
            break
        url = f"https://search.bilibili.com/video?keyword={keyword}&page={page}&page_size=30"
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # raise on HTTP errors
            data = response.text
            bvid_set.update(re.findall(r'href="//www\.bilibili\.com/video/(.*?)/"', data))  # extract BV ids
        except requests.RequestException as e:
            print(f"Request error: {e}")
    bvid_list = list(bvid_set)[:limit]  # convert to a list and cap the count
    print("Scraped", len(bvid_list), "BV ids")
    return bvid_list
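

# Note: `keyword` is interpolated into the search URL unescaped. requests
# percent-encodes non-ASCII characters on its own, but explicit quoting is
# safer if the keyword could contain characters like '&' or '#'. A minimal
# sketch using only the standard library:
#
#     from urllib.parse import quote
#     url = f"https://search.bilibili.com/video?keyword={quote(keyword)}&page={page}&page_size=30"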


# Fetch the danmaku (on-screen comments) for each BV id
def get_danmu(bvid_list):
    danmu_data = []  # collected danmaku text
    headers = {"User-Agent": "..."}  # request headers
    for bv in bvid_list:
        url = f'https://www.bilibili.com/video/{bv}/'  # video page URL
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        # the cid (comment-pool id) is embedded in the page's player config
        oids = re.findall(r'"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),', response.text)
        print(oids)
        # fetch the danmaku XML for each cid
        for cid in oids:
            url = f"https://comment.bilibili.com/{cid}.xml"
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            danmu_list = re.findall('<d p=".*?">(.*?)</d>', response.text)  # extract danmaku text
            print(danmu_list)
            danmu_data.extend(danmu_list)
            time.sleep(0.5)  # throttle requests
    return danmu_data
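

# The cid regex above depends on Bilibili's embedded player JSON and will break
# silently if the markup changes. A sketch of an alternative (assumption: the
# widely documented pagelist endpoint and its response shape, unverified here)
# that resolves cids through the web API instead:
def get_cids_via_api(bv, headers):
    """Return the cid of every part (page) of a video, or [] on failure."""
    resp = requests.get(
        "https://api.bilibili.com/x/player/pagelist",
        params={"bvid": bv},
        headers=headers,
    )
    try:
        payload = resp.json()
        return [page["cid"] for page in payload.get("data") or []]
    except (ValueError, KeyError, TypeError):
        return []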


# Save the danmaku to a text file and an Excel file
def save_danmu(danmu_list, filename_txt="danmu.txt", filename_xlsx="danmu.xlsx"):
    with open(filename_txt, "w", encoding="utf-8") as file:
        for danmu in danmu_list:
            file.write(danmu + "\n")
    df = pd.DataFrame(danmu_list, columns=['弹幕内容'])  # column: danmaku text
    df.to_excel(filename_xlsx, index=False, engine='openpyxl')
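

# openpyxl raises IllegalCharacterError on control characters that occasionally
# show up in scraped text. If that happens, stripping them first is a simple
# workaround (a sketch, not part of the original script):
#
#     clean = [re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', d) for d in danmu_list]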


# Read the Excel file and keep only danmaku containing one of the keywords
def read_and_filter_danmu(file_path):
    df = pd.read_excel(file_path, engine='openpyxl')
    keywords = ['AI', 'AI技术', 'AI技术应用', '人工智能应用', '人工智能', '机器学习', '生成']
    # join the keywords into a single regex alternation for the substring match
    filtered_danmu = df[df['弹幕内容'].str.contains('|'.join(keywords), na=False, case=False)]
    return filtered_danmu
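

# '|'.join(keywords) builds a regex alternation, so a keyword containing regex
# metacharacters (e.g. '+' or '?') would be misread as a pattern. Escaping each
# keyword avoids that:
#
#     pattern = '|'.join(re.escape(k) for k in keywords)
#     filtered_danmu = df[df['弹幕内容'].str.contains(pattern, na=False, case=False)]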


# Count and rank the filtered danmaku, keeping the most frequent ones
def count_and_sort_danmu(danmu_df):
    danmu_counts = danmu_df['弹幕内容'].value_counts()  # sorted descending by default
    top_danmu = danmu_counts.head(8)
    return top_danmu


# Save the most frequent danmaku to an Excel file
def save_top_danmu(top_danmu, output_file):
    top_danmu_df = pd.DataFrame(top_danmu).reset_index()  # Series -> two-column frame
    top_danmu_df.columns = ['弹幕内容', '数量']  # columns: danmaku text, count
    top_danmu_df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Top danmu saved to {output_file}")


# Render a word cloud image from the danmaku
def generate_wordcloud(danmu_list, output_image_path):
    text = ' '.join(danmu_list)
    # msyh.ttc (Microsoft YaHei) provides CJK glyphs; adjust the font path on non-Windows systems
    wordcloud = WordCloud(font_path='msyh.ttc', background_color='white', width=800, height=600, min_font_size=10)
    wordcloud.generate(text)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(output_image_path, dpi=300)
    plt.show()
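

# WordCloud splits on whitespace, so unsegmented Chinese danmaku are counted as
# whole phrases rather than words. For a word-level cloud, jieba (an extra
# dependency, not imported above) can pre-segment the text:
#
#     import jieba
#     text = ' '.join(jieba.lcut(' '.join(danmu_list)))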


# Sentiment analysis with snownlp
def analyze_sentiment(danmu_list):
    sentiment_results = []
    for danmu in danmu_list:
        s = SnowNLP(danmu)
        sentiment_score = s.sentiments  # probability in [0, 1] that the text is positive
        sentiment_results.append(sentiment_score)
    return sentiment_results
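

# A small helper (hypothetical, not used by the pipeline below) that maps a
# SnowNLP score to a coarse label; 0.5 is an assumed threshold:
def label_sentiment(score, threshold=0.5):
    """Map a sentiment score in [0, 1] to 'positive' or 'negative'."""
    return "positive" if score >= threshold else "negative"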


if __name__ == "__main__":
    keyword = "2024巴黎奥运会"  # search keyword: "2024 Paris Olympics"
    bvid_list = get_bv_and_cid(keyword)
    danmu_list = get_danmu(bvid_list)
    save_danmu(danmu_list)

    file_path = 'danmu.xlsx'
    filtered_danmu = read_and_filter_danmu(file_path)
    top_danmu = count_and_sort_danmu(filtered_danmu)

    output_file = 'top_danmu.xlsx'
    save_top_danmu(top_danmu, output_file)

    file_path = 'top_danmu.xlsx'
    top_danmu_df = pd.read_excel(file_path, engine='openpyxl')

    top_list = top_danmu_df['弹幕内容'].tolist()
    if not top_list:
        print("No danmaku matched the filter; skipping the word cloud step.")
    else:
        output_image_path = 'wordcloud.png'
        generate_wordcloud(top_list, output_image_path)
        sentiments = analyze_sentiment(top_list)
        print("Sentiment analysis results:", sentiments)