|
|
|
@ -2,10 +2,6 @@ import re
|
|
|
|
|
import requests
|
|
|
|
|
from multiprocessing.dummy import Pool
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from collections import Counter
|
|
|
|
|
from wordcloud import WordCloud
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
|
|
# 配置常量
|
|
|
|
|
KEYWORD = "2024 巴黎奥运会"
|
|
|
|
@ -37,7 +33,7 @@ def get_bvs(html: str) -> list:
|
|
|
|
|
return re.findall(r'bvid:"([^"]+)"', html)
|
|
|
|
|
|
|
|
|
|
def get_info(vid: str) -> dict:
|
|
|
|
|
"""获取视频信息"""
|
|
|
|
|
"""获取视频信息,如标题和弹幕数"""
|
|
|
|
|
url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
|
|
|
|
|
try:
|
|
|
|
|
response = requests.get(url)
|
|
|
|
@ -47,6 +43,8 @@ def get_info(vid: str) -> dict:
|
|
|
|
|
if 'data' in data:
|
|
|
|
|
info = {
|
|
|
|
|
"标题": data["data"]["View"]["title"],
|
|
|
|
|
"总弹幕数": data["data"]["View"]["stat"]["danmaku"],
|
|
|
|
|
"视频数量": data["data"]["View"]["videos"],
|
|
|
|
|
"cid": [dic["cid"] for dic in data["data"]["View"]["pages"]]
|
|
|
|
|
}
|
|
|
|
|
return info
|
|
|
|
@ -71,10 +69,13 @@ def get_danmu(info: dict) -> list:
|
|
|
|
|
print(f"获取弹幕{len(all_dms)}条!")
|
|
|
|
|
return all_dms
|
|
|
|
|
|
|
|
|
|
def save_danmu(bv: str, danmu_data: list):
    """Append danmaku strings to the keyword's Excel workbook.

    The workbook lives at ``./{KEYWORD}弹幕.xlsx`` and has a single column
    named ``弹幕``. If the workbook already exists, its rows are read back
    and the new rows are appended after them; otherwise a fresh workbook
    with a header row is created.

    Args:
        bv: Identifier passed by the caller; not used when building the
            output path (the path depends only on ``KEYWORD``).
        danmu_data: List of danmaku strings to store, one per row.
    """
    import os

    path = f"./{KEYWORD}弹幕.xlsx"
    df = pd.DataFrame(danmu_data, columns=['弹幕'])

    # DataFrame.to_excel has no append mode (passing mode='a' raises
    # TypeError), so emulate appending by merging with the existing rows
    # and rewriting the whole sheet.
    if os.path.exists(path):
        df = pd.concat([pd.read_excel(path), df], ignore_index=True)

    df.to_excel(path, index=False)
|
|
|
|
|
def save_danmu(bv: str):
    """Fetch the danmaku of one video and append it to the keyword's text file.

    The video metadata for ``bv`` is obtained through ``get_info`` and handed
    to ``get_danmu``; every returned entry is then appended as its own line
    to ``./{KEYWORD}弹幕.txt`` using UTF-8 encoding.

    Args:
        bv: The video's BV identifier.
    """
    danmu_lines = get_danmu(get_info(bv))
    out_path = f"./{KEYWORD}弹幕.txt"
    # Append mode: successive calls accumulate into the same file.
    with open(out_path, "a", encoding="utf-8") as out_file:
        out_file.writelines(dm + "\n" for dm in danmu_lines)
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
|
"""主函数:爬取视频信息和弹幕"""
|
|
|
|
@ -88,44 +89,10 @@ def main():
|
|
|
|
|
# 限制为前三百个视频
|
|
|
|
|
bvs = bvs[:300]
|
|
|
|
|
|
|
|
|
|
all_danmu = []
|
|
|
|
|
|
|
|
|
|
# 爬取弹幕
|
|
|
|
|
for bv in tqdm(bvs, desc="正在爬取弹幕"):
|
|
|
|
|
info = get_info(bv)
继续完成上述Python代码,确保我们可以爬取B站弹幕、保存到Excel文件,并生成词云图。
|
|
|
|
|
|
|
|
|
|
### 继续的代码
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
if info:
|
|
|
|
|
danmu = get_danmu(info)
|
|
|
|
|
all_danmu.extend(danmu)
|
|
|
|
|
|
|
|
|
|
# 统计AI相关弹幕数量
|
|
|
|
|
counter = Counter(all_danmu)
|
|
|
|
|
top_danmu = counter.most_common(8)
|
|
|
|
|
|
|
|
|
|
# 输出前8的弹幕
|
|
|
|
|
print("AI相关弹幕统计(数量排名前8):")
|
|
|
|
|
for text, count in top_danmu:
|
|
|
|
|
print(f"{text}: {count}")
|
|
|
|
|
|
|
|
|
|
# 将弹幕数据写入Excel
|
|
|
|
|
save_danmu(KEYWORD, all_danmu)
|
|
|
|
|
|
|
|
|
|
# 生成词云图
|
|
|
|
|
generate_wordcloud(all_danmu)
|
|
|
|
|
|
|
|
|
|
def generate_wordcloud(danmu_data, font_path=None):
    """Render a word cloud from danmaku text and display it with matplotlib.

    The danmaku strings are joined with single spaces and passed to
    ``WordCloud.generate``; the resulting image is shown in an 10x5 inch
    figure with the axes hidden.

    Args:
        danmu_data: Iterable of danmaku strings.
        font_path: Optional path to a font file forwarded to ``WordCloud``.
            The default ``None`` keeps the library's built-in font.
            NOTE(review): the built-in font may not cover CJK glyphs, so
            callers rendering Chinese text will likely want to pass a
            CJK-capable font here — confirm on the target machine.

    Returns:
        None. When there is no text to plot, a message is printed and
        nothing is drawn.
    """
    text = " ".join(danmu_data)

    # WordCloud.generate raises ValueError when it is given no words, so
    # bail out early instead of crashing the caller after a long crawl.
    if not text.strip():
        print("没有弹幕数据,无法生成词云图")
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        font_path=font_path,
    ).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("弹幕词云图")
    plt.show()
|
|
|
|
|
save_danmu(bv)
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
main()
|
|
|
|
|
|