requirements.txt

main
pzrxqba79 2 months ago
parent 1ec2ae842e
commit cb85d3ba9d

@@ -1,130 +1,5 @@
import re
import requests
from multiprocessing.dummy import Pool
from tqdm import tqdm
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Configuration constants
KEYWORD = "2024 巴黎奥运会"
DANMU_KEYWORD = "AI"  # keyword used to filter danmu (bullet comments)
PAGENUM = 10  # number of search result pages to crawl
WORKERS = 6  # number of worker threads in the pool

# HTTP request headers
HEADERS = {
    "cookie": "your_cookie_here",  # replace with an actual cookie
    "origin": "https://www.bilibili.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
}
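# Note (assumption, not part of the original script): Bilibili endpoints may
# return partial or empty results without a valid logged-in cookie (such as
# SESSDATA), so replace "your_cookie_here" with a real browser cookie before
# running.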
def get_search_results_html(page: int) -> str:
    """Fetch the HTML of one search results page."""
    url = f"https://search.bilibili.com/all?keyword={KEYWORD}&order=click&page={page}"
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return ""
def get_bvs(html: str) -> list:
    """Extract BV ids from the search page (they appear in its embedded JSON state)."""
    return re.findall(r'bvid:"([^"]+)"', html)
def get_info(vid: str) -> dict:
    """Fetch video metadata: the title and the cid of each part."""
    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        data = response.json()
        if 'data' in data:
            return {
                "标题": data["data"]["View"]["title"],
                "cid": [dic["cid"] for dic in data["data"]["View"]["pages"]],
            }
    except requests.RequestException as e:
        print(f"Error fetching info for vid {vid}: {e}")
    return {}  # also reached when the response carries no 'data' field
def get_danmu(info: dict) -> list:
    """Fetch a video's danmu via the legacy XML endpoint, one request per cid."""
    all_dms = []
    for cid in info.get("cid", []):
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            response = requests.get(url, headers=HEADERS)
            response.encoding = "utf-8"
            data = re.findall('<d p="(.*?)">(.*?)</d>', response.text)
            dms = [d[1] for d in data if DANMU_KEYWORD in d[1]]  # keep only danmu containing the keyword
            all_dms += dms
        except requests.RequestException as e:
            print(f"Error fetching danmu for cid {cid}: {e}")
    print(f"Fetched {len(all_dms)} danmu!")
    return all_dms
def save_danmu(keyword: str, danmu_data: list):
    """Save the danmu to an Excel file."""
    # DataFrame.to_excel has no append mode, so write everything in one pass
    df = pd.DataFrame(danmu_data, columns=['弹幕'])
    df.to_excel(f"./{keyword}弹幕.xlsx", index=False)
def main():
    """Main entry point: crawl video info and danmu."""
    pool = Pool(WORKERS)
    htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))
    bvs = []
    for html in htmls:
        bvs.extend(get_bvs(html))
    # Limit to the first 300 videos
    bvs = bvs[:300]
    all_danmu = []
    # Crawl the danmu for each video
    for bv in tqdm(bvs, desc="Crawling danmu"):
        info = get_info(bv)
        if info:
            danmu = get_danmu(info)
            all_danmu.extend(danmu)
    # Count the AI-related danmu
    counter = Counter(all_danmu)
    top_danmu = counter.most_common(8)
    # Print the 8 most frequent danmu
    print("Top 8 AI-related danmu by count:")
    for text, count in top_danmu:
        print(f"{text}: {count}")
    # Write the danmu to Excel
    save_danmu(KEYWORD, all_danmu)
    # Generate the word cloud
    generate_wordcloud(all_danmu)
def generate_wordcloud(danmu_data):
    """Generate a word cloud from the danmu."""
    text = " ".join(danmu_data)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("Danmu word cloud")
    plt.show()
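# Note (assumption, not part of the original script): WordCloud's default font
# cannot render CJK glyphs, so Chinese danmu would appear as empty boxes.
# Passing a local CJK font fixes this; the path below is hypothetical and
# depends on the fonts installed on your machine:
#   WordCloud(font_path="C:/Windows/Fonts/simhei.ttf", width=800, height=400,
#             background_color='white').generate(text)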
if __name__ == "__main__":
    main()
requests
pandas
tqdm
wordcloud
matplotlib