requirements.txt

main
pzrxqba79 2 months ago
parent 1ec2ae842e
commit cb85d3ba9d

@@ -1,130 +1,5 @@
requests
pandas
tqdm
wordcloud
matplotlib

import re
import requests
from multiprocessing.dummy import Pool
from tqdm import tqdm
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Configuration constants
KEYWORD = "2024 巴黎奥运会"  # search keyword ("2024 Paris Olympics")
DANMU_KEYWORD = "AI"  # keep only danmu (bullet comments) containing this keyword
PAGENUM = 10  # number of search result pages to crawl
WORKERS = 6  # number of worker threads in the pool

# HTTP request headers
HEADERS = {
    "cookie": "your_cookie_here",  # replace with an actual cookie
    "origin": "https://www.bilibili.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
}
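
# Note (assumption, not part of the original script): Bilibili's search pages
# generally refuse anonymous requests, so the "your_cookie_here" placeholder
# above must be replaced with a cookie from a logged-in browser session
# (e.g. one carrying SESSDATA) for the crawl to return results.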

def get_search_results_html(page: int) -> str:
    """Fetch the HTML of one search results page."""
    url = f"https://search.bilibili.com/all?keyword={KEYWORD}&order=click&page={page}"
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return ""

def get_bvs(html: str) -> list:
    """Extract BV ids from the HTML content."""
    return re.findall(r'bvid:"([^"]+)"', html)
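
# The regex above relies on bvid fields embedded in the page's inline JSON,
# e.g. bvid:"BV1xx411c7mD" (illustrative id, not a real result), and may break
# if Bilibili changes how search results are rendered.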

def get_info(vid: str) -> dict:
    """Fetch a video's title and its list of cids."""
    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        data = response.json()
        if "data" in data:
            return {
                "title": data["data"]["View"]["title"],
                "cid": [dic["cid"] for dic in data["data"]["View"]["pages"]],
            }
    except requests.RequestException as e:
        print(f"Error fetching info for vid {vid}: {e}")
    return {}  # fall through to an empty dict on error or missing data
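
# Shape of the response this function relies on (inferred from the code, not
# an official schema): {"data": {"View": {"title": "...",
# "pages": [{"cid": 123}, ...]}}} -- one cid per video part.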

def get_danmu(info: dict) -> list:
    """Fetch the danmu for every cid (video part) of a video."""
    all_dms = []
    for cid in info.get("cid", []):
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            response = requests.get(url, headers=HEADERS)
            response.encoding = "utf-8"
            data = re.findall('<d p="(.*?)">(.*?)</d>', response.text)
            dms = [d[1] for d in data if DANMU_KEYWORD in d[1]]  # keep only danmu containing "AI"
            all_dms += dms
        except requests.RequestException as e:
            print(f"Error fetching danmu for cid {cid}: {e}")
    print(f"Fetched {len(all_dms)} danmu!")
    return all_dms
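
# For reference, each danmu in the XML response looks roughly like
#   <d p="12.3,1,25,16777215,...">comment text</d>
# The regex captures the p attribute (timing/style metadata) and the body;
# only the body, d[1], is kept.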

def save_danmu(keyword: str, danmu_data: list):
    """Save the danmu to an Excel file."""
    df = pd.DataFrame(danmu_data, columns=["danmu"])
    # DataFrame.to_excel has no mode= parameter; since the script writes all
    # the danmu in one call, a plain overwrite is sufficient.
    df.to_excel(f"./{keyword}_danmu.xlsx", index=False)
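
# Note (assumption): pandas writes .xlsx files through an engine such as
# openpyxl, which is not listed in the new requirements.txt and may need to be
# installed separately (pip install openpyxl).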

def main():
    """Main entry point: crawl video info and danmu."""
    with Pool(WORKERS) as pool:
        htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))
    bvs = []
    for html in htmls:
        bvs.extend(get_bvs(html))
    # Cap at the first 300 videos
    bvs = bvs[:300]
    all_danmu = []
    # Crawl the danmu for each video
    for bv in tqdm(bvs, desc="Crawling danmu"):
        info = get_info(bv)
        if info:
            danmu = get_danmu(info)
            all_danmu.extend(danmu)
    # Count the AI-related danmu
    counter = Counter(all_danmu)
    top_danmu = counter.most_common(8)
    # Print the top 8 danmu
    print("Top 8 AI-related danmu by count:")
    for text, count in top_danmu:
        print(f"{text}: {count}")
    # Write the danmu to Excel
    save_danmu(KEYWORD, all_danmu)
    # Generate the word cloud
    generate_wordcloud(all_danmu)

def generate_wordcloud(danmu_data):
    """Generate a word cloud from the danmu."""
    text = " ".join(danmu_data)
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Danmu Word Cloud")
    plt.show()
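
# Note (assumption, not in the original script): WordCloud's bundled default
# font cannot render CJK glyphs, and its tokenizer does not segment Chinese
# text, so Chinese danmu may show up as boxes or unsegmented phrases. A sketch
# using jieba (an extra dependency, also absent from requirements.txt):
#
#   import jieba
#   text = " ".join(jieba.lcut(" ".join(danmu_data)))
#   wc = WordCloud(font_path="msyh.ttc",  # hypothetical path to a CJK font
#                  width=800, height=400,
#                  background_color="white").generate(text)
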
if __name__ == "__main__":
main()