Update README.md

1.1
pzrxqba79 2 months ago
parent 18da276714
commit 413b7721e7

@ -1,2 +1,98 @@
# 1.1
import re
import requests
from multiprocessing.dummy import Pool
from tqdm import tqdm
# --- Configuration constants ---
KEYWORD = "2024 巴黎奥运会"  # Bilibili search keyword ("2024 Paris Olympics")
DANMU_KEYWORD = "AI"  # keep only danmu (bullet comments) containing this substring
PAGENUM = 10  # number of search-result pages to crawl
WORKERS = 6  # thread-pool size for fetching search pages
# HTTP request headers sent with the search-page requests
HEADERS = {
    "cookie": "your_cookie_here",  # replace with a real logged-in cookie
    'origin': 'https://www.bilibili.com',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
}
def get_search_results_html(page: int) -> str:
    """Fetch the HTML of one Bilibili search-result page for KEYWORD.

    Args:
        page: 1-based page index of the search results.

    Returns:
        The page HTML, or "" on any request failure (logged, not raised).
    """
    url = f"https://search.bilibili.com/all?keyword={KEYWORD}&order=click&page={page}"
    try:
        # timeout prevents a single stalled connection from hanging the pool worker forever
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return ""
def get_bvs(html: str) -> list:
    """Extract every BV id embedded in a search-result page.

    Bilibili inlines video ids as `bvid:"..."` fields in the page source;
    this returns them in order of appearance.
    """
    bv_pattern = re.compile(r'bvid:"([^"]+)"')
    return bv_pattern.findall(html)
def get_info(vid: str) -> dict:
    """Fetch metadata for one video: title, danmu count, part count and cids.

    Args:
        vid: the video's BV id.

    Returns:
        A dict with keys "标题", "总弹幕数", "视频数量" and "cid" (list of
        per-part cids), or {} on any failure. The original implicitly
        returned None when the payload had no 'data' key; callers use
        `info.get(...)`, so {} is the correct fallback everywhere.
    """
    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
    try:
        # timeout guards against hanging; API calls previously had none
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()
        if 'data' in data:
            info = {
                "标题": data["data"]["View"]["title"],
                "总弹幕数": data["data"]["View"]["stat"]["danmaku"],
                "视频数量": data["data"]["View"]["videos"],
                "cid": [dic["cid"] for dic in data["data"]["View"]["pages"]]
            }
            return info
    except requests.RequestException as e:
        print(f"Error fetching info for vid {vid}: {e}")
    # unified fallback: missing 'data' key or request error both yield {}
    return {}
# matches one danmu entry in the XML danmu feed: attributes + comment text
_DANMU_RE = re.compile('<d p="(.*?)">(.*?)</d>')

def get_danmu(info: dict) -> list:
    """Download the danmu of every part of a video and keep the matching ones.

    Args:
        info: metadata dict from get_info(); only its "cid" list is used,
              so {} (the failure fallback) safely yields [].

    Returns:
        All danmu strings containing DANMU_KEYWORD, across all parts.
    """
    all_dms = []
    for cid in info.get("cid", []):
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            # timeout added; raise_for_status so HTTP error pages are not
            # silently regex-scanned as if they were danmu XML
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            response.encoding = "utf-8"
            data = _DANMU_RE.findall(response.text)
            # d[1] is the comment text; keep only those mentioning the keyword
            dms = [d[1] for d in data if DANMU_KEYWORD in d[1]]
            all_dms += dms
        except requests.RequestException as e:
            print(f"Error fetching danmu for cid {cid}: {e}")
    print(f"获取弹幕{len(all_dms)}条!")
    return all_dms
def save_danmu(bv: str):
    """Fetch all danmu for one video and append them to the output file."""
    video_info = get_info(bv)
    comments = get_danmu(video_info)
    with open(f"./{KEYWORD}弹幕.txt", "a", encoding="utf-8") as sink:
        sink.writelines(comment + "\n" for comment in comments)
def main():
    """Crawl search pages in parallel, then fetch danmu for each video.

    Fix: the worker Pool was created but never closed/joined (resource
    leak); the context manager closes and joins it deterministically.
    """
    with Pool(WORKERS) as pool:
        # fetch all search-result pages concurrently (I/O-bound)
        htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))
    bvs = []
    for html in htmls:
        bvs.extend(get_bvs(html))
    # cap at the first 300 videos
    bvs = bvs[:300]
    # crawl danmu sequentially with a progress bar
    for bv in tqdm(bvs, desc="正在爬取弹幕"):
        save_danmu(bv)

if __name__ == "__main__":
    main()

Loading…
Cancel
Save