You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

132 lines
4.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
import requests
from multiprocessing.dummy import Pool
from tqdm import tqdm
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Configuration constants
KEYWORD = "2024 巴黎奥运会"  # search keyword; also used in the output file name
DANMU_KEYWORD = "AI"  # only danmu containing this substring are kept
PAGENUM = 10  # number of search-result pages to crawl
WORKERS = 6  # number of worker threads in the pool

# HTTP request headers
HEADERS = {
    "cookie": "your_cookie_here",  # replace with a real cookie
    "origin": "https://www.bilibili.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
}
def get_search_results_html(page: int) -> str:
    """Fetch the HTML of one Bilibili search-results page for KEYWORD.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The response body as text, or an empty string if the request fails
        (the error is printed, not raised).
    """
    url = f"https://search.bilibili.com/all?keyword={KEYWORD}&order=click&page={page}"
    try:
        # Without a timeout a stalled connection would block a pool worker forever.
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return ""
    return response.text
def get_bvs(html: str) -> list:
    """Extract video BV ids from search-page HTML.

    Every ``bvid:"..."`` occurrence in *html* is matched. Ids are returned in
    first-seen order with duplicates removed, so a video that appears more
    than once in the page is not crawled (and counted) twice.

    Args:
        html: Raw HTML text; an empty string yields an empty list.

    Returns:
        A list of unique BV id strings.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(re.findall(r'bvid:"([^"]+)"', html)))
def get_info(vid: str) -> dict:
    """Fetch the title and page cids of one video.

    Args:
        vid: BV id of the video, sent as the ``bvid`` query parameter.

    Returns:
        A dict with keys ``"标题"`` (the video title) and ``"cid"`` (a list
        with one cid per entry of the video's ``pages``), or an empty dict
        if the request fails or the payload lacks the expected fields.
    """
    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
    try:
        # Same headers as the search request; the timeout prevents an
        # indefinite hang on a stalled connection.
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        view = response.json()["data"]["View"]
        return {
            "标题": view["title"],
            "cid": [page["cid"] for page in view["pages"]],
        }
    except requests.RequestException as e:
        print(f"Error fetching info for vid {vid}: {e}")
    except (KeyError, TypeError, ValueError) as e:
        # Error payloads may have no "data" key, "data": null, or a
        # non-JSON body; treat all of these as "no info".
        print(f"Unexpected response for vid {vid}: {e!r}")
    return {}
def get_danmu(info: dict) -> list:
    """Download the danmu of a video and keep those containing DANMU_KEYWORD.

    Args:
        info: Video info dict; its ``"cid"`` entry (a list of cids) selects
            the danmu lists to download. A missing key means no downloads.

    Returns:
        A list of danmu texts, across all cids, that contain DANMU_KEYWORD.
        A cid whose request fails is reported and skipped.
    """
    # Each danmu is a <d p="attributes">text</d> element; compile once.
    pattern = re.compile(r'<d p="(.*?)">(.*?)</d>')
    all_dms = []
    for cid in info.get("cid", []):
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            # Surface HTTP errors instead of silently parsing an error page.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching danmu for cid {cid}: {e}")
            continue
        response.encoding = "utf-8"
        all_dms.extend(
            text
            for _, text in pattern.findall(response.text)
            if DANMU_KEYWORD in text
        )
    print(f"获取弹幕{len(all_dms)}条!")
    return all_dms
def save_danmu(bv: str, danmu_data: list):
    """Append danmu texts to the Excel file ``./{KEYWORD}弹幕.xlsx``.

    ``DataFrame.to_excel`` has no append mode, so when the file already
    exists its rows are read back, the new rows are added after them, and
    the whole sheet is rewritten with a single header row.

    Args:
        bv: Unused; kept so existing callers keep working.
        danmu_data: Danmu texts, written to a single column named ``弹幕``.
    """
    from pathlib import Path

    path = Path(f"./{KEYWORD}弹幕.xlsx")
    df = pd.DataFrame(danmu_data, columns=['弹幕'])
    if path.exists():
        existing = pd.read_excel(path)
        df = pd.concat([existing, df], ignore_index=True)
    df.to_excel(path, index=False)
def main():
    """Crawl search results, collect matching danmu, then report and save them.

    Steps: fetch PAGENUM search pages in a thread pool, extract BV ids (at
    most the first 300), download the filtered danmu of each video, print
    the 8 most frequent danmu, write everything to Excel, and draw a word
    cloud.
    """
    # The context manager shuts the thread pool down once map() has returned.
    with Pool(WORKERS) as pool:
        htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))

    bvs = []
    for html in htmls:
        bvs.extend(get_bvs(html))
    # Keep only the first 300 videos
    bvs = bvs[:300]

    all_danmu = []
    # Crawl the danmu of every video
    for bv in tqdm(bvs, desc="正在爬取弹幕"):
        info = get_info(bv)
        if info:
            danmu = get_danmu(info)
            all_danmu.extend(danmu)

    # Count identical danmu texts and print the 8 most frequent
    counter = Counter(all_danmu)
    top_danmu = counter.most_common(8)
    print("AI相关弹幕统计数量排名前8")
    for text, count in top_danmu:
        print(f"{text}: {count}")

    # Write the danmu to Excel
    save_danmu(KEYWORD, all_danmu)
    # Draw the word cloud
    generate_wordcloud(all_danmu)
def generate_wordcloud(danmu_data, font_path=None):
    """Render and display a word cloud of the given danmu texts.

    Args:
        danmu_data: Iterable of danmu strings; they are joined with spaces
            to form the word-cloud input.
        font_path: Optional path to a font file, passed to ``WordCloud``.
            NOTE(review): the default wordcloud font is believed to lack
            CJK glyphs, so Chinese text likely needs a CJK-capable font
            here to render correctly; confirm against the wordcloud docs.

    Returns:
        None. If there is no text to draw, a message is printed and
        nothing is shown.
    """
    text = " ".join(danmu_data)
    # WordCloud.generate raises ValueError when given no words.
    if not text.strip():
        print("No danmu text available; skipping word cloud.")
        return
    wordcloud = WordCloud(
        font_path=font_path,
        width=800,
        height=400,
        background_color='white',
    ).generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("弹幕词云图")
    plt.show()
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()