requirements.txt

11 months ago · cb85d3ba9d
parent 1ec2ae842e
commit cb85d3ba9d
1 changed files with 5 additions and 130 deletions
--- a/README.md
+++ b/README.md
@@ -1,130 +1,5 @@
-import re
-import requests
-from multiprocessing.dummy import Pool
-from tqdm import tqdm
-import pandas as pd
-from collections import Counter
-from wordcloud import WordCloud
-import matplotlib.pyplot as plt
-
-# Configuration constants
-KEYWORD = "2024 巴黎奥运会"
-DANMU_KEYWORD = "AI"  # keyword a danmaku must contain to be kept
-PAGENUM = 10  # number of search-result pages to crawl
-WORKERS = 6  # number of worker threads in the thread pool
-
-# HTTP request headers
-HEADERS = {
-    "cookie": "your_cookie_here",  # replace with a real cookie
-    'origin': 'https://www.bilibili.com',
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
-}
-
-def get_search_results_html(page: int) -> str:
-    """Fetch the HTML of search-results page `page` for KEYWORD; return an empty string on a request error."""
-    url = f"https://search.bilibili.com/all?keyword={KEYWORD}&order=click&page={page}"
-    try:
-        response = requests.get(url, headers=HEADERS)
-        response.raise_for_status()
-        return response.text
-    except requests.RequestException as e:
-        print(f"Error fetching page {page}: {e}")
-        return ""
-
-def get_bvs(html: str) -> list:
-    """Return every id captured by the bvid pattern in `html`, in order of appearance (duplicates kept)."""
-    return re.findall(r'bvid:"([^"]+)"', html)
-
-def get_info(vid: str) -> dict:
-    """Fetch the title and per-page cids of video `vid`; return {} when the request fails or has no 'data'."""
-    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-        data = response.json()
-        # Build the result only when the JSON payload carries a 'data' key
-        if 'data' in data:
-            info = {
-                "标题": data["data"]["View"]["title"],
-                "cid": [dic["cid"] for dic in data["data"]["View"]["pages"]]
-            }
-            return info
-    except requests.RequestException as e:
-        print(f"Error fetching info for vid {vid}: {e}")
-    return {}
-
-def get_danmu(info: dict) -> list:
-    """Collect, across every cid listed in `info`, the danmaku texts that contain DANMU_KEYWORD."""
-    all_dms = []
-    for cid in info.get("cid", []):
-        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
-        try:
-            response = requests.get(url)
-            response.encoding = "utf-8"
-            data = re.findall('<d p="(.*?)">(.*?)</d>', response.text)
-            dms = [d[1] for d in data if DANMU_KEYWORD in d[1]]  # keep only danmaku containing the keyword
-            all_dms += dms
-        except requests.RequestException as e:
-            print(f"Error fetching danmu for cid {cid}: {e}")
-    # Report how many matching danmaku were collected for this video
-    print(f"获取弹幕{len(all_dms)}条！")
-    return all_dms
-
-def save_danmu(bv: str, danmu_data: list):
-    """Write the danmaku to ./{KEYWORD}弹幕.xlsx as a single '弹幕' column, replacing any existing file; bv is unused."""
-    # DataFrame.to_excel has no mode= argument (passing one raises TypeError); it always rewrites the whole file.
-    pd.DataFrame(danmu_data, columns=['弹幕']).to_excel(f"./{KEYWORD}弹幕.xlsx", index=False)
-
-def main():
-    """Crawl the search pages, collect AI-related danmaku, print the top 8, save them to Excel and draw a word cloud."""
-    with Pool(WORKERS) as pool:  # thread pool is released once the pages are fetched
-        htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))
-    bvs = []
-
-    for html in htmls:
-        bvs.extend(get_bvs(html))
-
-    # Keep only the first 300 videos
-    bvs = bvs[:300]
-
-    all_danmu = []
-
-    # Fetch the danmaku of every video
-    for bv in tqdm(bvs, desc="正在爬取弹幕"):
-        info = get_info(bv)
-        # get_info returns {} on failure; such videos are skipped
-        if info:
-            danmu = get_danmu(info)
-            all_danmu.extend(danmu)
-
-    # Count how often each AI-related danmaku occurs
-    counter = Counter(all_danmu)
-    top_danmu = counter.most_common(8)
-
-    # Print the 8 most frequent danmaku
-    print("AI相关弹幕统计（数量排名前8）：")
-    for text, count in top_danmu:
-        print(f"{text}: {count}")
-
-    # Write the danmaku data to Excel
-    save_danmu(KEYWORD, all_danmu)
-
-    # Render the word cloud
-    generate_wordcloud(all_danmu)
-
-def generate_wordcloud(danmu_data):
-    """Join the danmaku into one text, build an 800x400 white-background word cloud and display it."""
-    text = " ".join(danmu_data)
-    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
-    # Show the rendered image in a 10x5 figure with the axes hidden
-    plt.figure(figsize=(10, 5))
-    plt.imshow(wordcloud, interpolation='bilinear')
-    plt.axis('off')
-    plt.title("弹幕词云图")
-    plt.show()
-
-if __name__ == "__main__":
-    main()  # run the crawler only when executed as a script
-
-
+requests
+pandas
+tqdm
+wordcloud
+matplotlib