diff --git a/README.md b/README.md
index 7aa0f72..7348e99 100644
--- a/README.md
+++ b/README.md
@@ -1,130 +1,5 @@
-import re
-import requests
-from multiprocessing.dummy import Pool
-from tqdm import tqdm
-import pandas as pd
-from collections import Counter
-from wordcloud import WordCloud
-import matplotlib.pyplot as plt
-
-# Configuration constants
-KEYWORD = "2024 巴黎奥运会"
-DANMU_KEYWORD = "AI"  # Keyword to filter danmu by
-PAGENUM = 10  # Number of search result pages to crawl
-WORKERS = 6  # Number of threads in the pool
-
-# HTTP request headers
-HEADERS = {
-    "cookie": "your_cookie_here",  # Replace with your real cookie
-    "origin": "https://www.bilibili.com",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
-}
-
-def get_search_results_html(page: int) -> str:
-    """Fetch the HTML of one search results page."""
-    url = f"https://search.bilibili.com/all?keyword={KEYWORD}&order=click&page={page}"
-    try:
-        response = requests.get(url, headers=HEADERS)
-        response.raise_for_status()
-        return response.text
-    except requests.RequestException as e:
-        print(f"Error fetching page {page}: {e}")
-        return ""
-
-def get_bvs(html: str) -> list:
-    """Extract BV ids from the HTML."""
-    return re.findall(r'bvid:"([^"]+)"', html)
-
-def get_info(vid: str) -> dict:
-    """Fetch video info (title and cid list)."""
-    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-        data = response.json()
-
-        if 'data' in data:
-            info = {
-                "title": data["data"]["View"]["title"],
-                "cid": [dic["cid"] for dic in data["data"]["View"]["pages"]],
-            }
-            return info
-    except requests.RequestException as e:
-        print(f"Error fetching info for vid {vid}: {e}")
-    return {}
-
-def get_danmu(info: dict) -> list:
-    """Fetch the danmu (bullet comments) of a video."""
-    all_dms = []
-    for cid in info.get("cid", []):
-        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
-        try:
-            response = requests.get(url)
-            response.encoding = "utf-8"
-            data = re.findall(r'<d p="(.*?)">(.*?)</d>', response.text)
-            dms = [d[1] for d in data if DANMU_KEYWORD in d[1]]  # Keep only danmu containing the keyword
-            all_dms += dms
-        except requests.RequestException as e:
-            print(f"Error fetching danmu for cid {cid}: {e}")
-
-    print(f"Fetched {len(all_dms)} danmu!")
-    return all_dms
-
-def save_danmu(bv: str, danmu_data: list):
-    """Save the danmu to an Excel file."""
-    df = pd.DataFrame(danmu_data, columns=["danmu"])
-    df.to_excel(f"./{KEYWORD}_danmu.xlsx", index=False)
-
-def main():
-    """Main entry point: crawl video info and danmu."""
-    pool = Pool(WORKERS)
-    htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))
-    bvs = []
-
-    for html in htmls:
-        bvs.extend(get_bvs(html))
-
-    # Limit to the first 300 videos
-    bvs = bvs[:300]
-
-    all_danmu = []
-
-    # Crawl the danmu
-    for bv in tqdm(bvs, desc="Crawling danmu"):
-        info = get_info(bv)
-
-        if info:
-            danmu = get_danmu(info)
-            all_danmu.extend(danmu)
-
-    # Count the AI-related danmu
-    counter = Counter(all_danmu)
-    top_danmu = counter.most_common(8)
-
-    # Print the top 8 danmu
-    print("AI-related danmu stats (top 8 by count):")
-    for text, count in top_danmu:
-        print(f"{text}: {count}")
-
-    # Write the danmu data to Excel
-    save_danmu(KEYWORD, all_danmu)
-
-    # Generate the word cloud
-    generate_wordcloud(all_danmu)
-
-def generate_wordcloud(danmu_data):
-    """Generate a word cloud from the danmu."""
-    text = " ".join(danmu_data)
-    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
-
-    plt.figure(figsize=(10, 5))
-    plt.imshow(wordcloud, interpolation="bilinear")
-    plt.axis("off")
-    plt.title("Danmu Word Cloud")
-    plt.show()
-
-if __name__ == "__main__":
-    main()
-
-
+requests
+pandas
+tqdm
+wordcloud
+matplotlib
\ No newline at end of file