|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import re
|
|
|
|
|
import time
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
|
|
# Crawler class for collecting danmaku (bullet comments) from Bilibili.
class BilibiliCrawler:
    """Search Bilibili videos by keyword and download their danmaku.

    Typical use: ``BilibiliCrawler().crawl(['keyword'], 'out.txt')``.
    All network access goes through a single ``requests.Session`` with
    browser-like headers.
    """

    def __init__(self):
        # Browser-like headers so requests are less likely to be blocked.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.bilibili.com/'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def search_videos(self, keyword, pages=3, max_videos=20):
        """Search videos and return a list of unique BV ids.

        Args:
            keyword: search term passed to Bilibili's search page.
            pages: number of result pages to scrape.
            max_videos: cap on how many unique ids to return (was a
                hard-coded 20; kept as the default for compatibility).

        Returns:
            List of BV-id strings, deduplicated, search order preserved,
            truncated to ``max_videos``. Errors on a page are logged and
            that page is skipped.
        """
        print("正在搜索视频...")
        video_ids = []

        for page in range(1, pages + 1):
            try:
                # Build the search request (no f-string needed: the URL
                # has no placeholders).
                url = 'https://search.bilibili.com/all'
                params = {
                    'keyword': keyword,
                    'page': page,
                    'order': 'totalrank'  # "comprehensive" ranking
                }

                response = self.session.get(url, params=params, timeout=10)
                response.encoding = 'utf-8'

                # Parse the HTML with BeautifulSoup.
                soup = BeautifulSoup(response.text, 'html.parser')

                # Video links; dots are escaped so '.' cannot match
                # arbitrary characters in the hostname.
                video_links = soup.find_all('a', href=re.compile(r'//www\.bilibili\.com/video/(BV[0-9A-Za-z]+)'))

                for link in video_links:
                    # get('href') may return None for malformed tags;
                    # fall back to '' so the membership test is safe.
                    href = link.get('href') or ''
                    if 'BV' in href:
                        # Extract the BV id from the link.
                        bv_match = re.search(r'BV[0-9A-Za-z]+', href)
                        if bv_match:
                            video_ids.append(bv_match.group())

                print(f'第{page}页搜索完成,找到{len(video_links)}个视频')

                # Random delay to avoid hammering the server.
                time.sleep(random.uniform(1, 3))

            except Exception as e:
                print(f'搜索第{page}页时出错: {e}')
                continue

        # Deduplicate while keeping first-seen (search-rank) order;
        # list(set(...)) would shuffle the ranking nondeterministically.
        video_ids = list(dict.fromkeys(video_ids))
        print(f'共找到{len(video_ids)}个唯一视频')
        return video_ids[:max_videos]  # cap to limit request volume

    def get_danmaku(self, bvid):
        """Fetch all danmaku texts for one video.

        Args:
            bvid: the video's BV id.

        Returns:
            List of non-empty danmaku strings; an empty list on any
            error (errors are logged, never raised to the caller).
        """
        try:
            # First resolve the video's cid (the danmaku stream id).
            video_info_url = 'https://api.bilibili.com/x/web-interface/view'
            params = {'bvid': bvid}

            response = self.session.get(video_info_url, params=params, timeout=10)
            video_info = response.json()

            # code != 0 means the API rejected the request.
            if video_info['code'] != 0:
                print(f"无法获取视频{bvid}信息")
                return []

            cid = video_info['data']['cid']

            # Fetch the raw danmaku XML.
            danmaku_url = 'https://api.bilibili.com/x/v1/dm/list.so'
            params = {'oid': cid}

            response = self.session.get(danmaku_url, params=params, timeout=10)
            response.encoding = 'utf-8'

            # Parse the XML danmaku list; each <d> element is one
            # comment. NOTE(review): the 'xml' feature requires lxml to
            # be installed — confirm it is a project dependency.
            soup = BeautifulSoup(response.text, 'xml')
            danmakus = soup.find_all('d')

            danmaku_list = []
            for danmaku in danmakus:
                text = danmaku.get_text()
                if text and len(text.strip()) > 0:
                    danmaku_list.append(text.strip())

            print(f'视频{bvid}获取到{len(danmaku_list)}条弹幕')
            return danmaku_list

        except Exception as e:
            print(f'获取视频{bvid}弹幕时出错: {e}')
            return []

    def crawl(self, keywords, output_file='danmaku.txt'):
        """Crawl danmaku for every keyword and save them to a file.

        Args:
            keywords: iterable of search terms.
            output_file: UTF-8 text file, one danmaku per line.

        Returns:
            List of all collected danmaku strings.
        """
        all_danmakus = []

        for keyword in keywords:
            print(f'\n开始爬取关键词: {keyword}')
            video_ids = self.search_videos(keyword)

            for i, bvid in enumerate(video_ids):
                print(f'正在处理第{i+1}/{len(video_ids)}个视频: {bvid}')
                danmakus = self.get_danmaku(bvid)
                all_danmakus.extend(danmakus)

                # Rewrite the output after every video so progress
                # survives a crash mid-run (deliberate best-effort
                # checkpointing, not an accident).
                with open(output_file, 'w', encoding='utf-8') as f:
                    for danmaku in all_danmakus:
                        f.write(danmaku + '\n')

                # Short delay between videos.
                time.sleep(random.uniform(0.5, 1.5))

        print(f'\n爬取完成!共获取{len(all_danmakus)}条弹幕,已保存到{output_file}')
        return all_danmakus
|
|
|
|
|
|
# Script entry point.
if __name__ == '__main__':
    bot = BilibiliCrawler()

    # Search terms to crawl.
    search_terms = ['大语言模型', 'LLM', '大模型']

    # Run the crawl and collect every danmaku found.
    collected = bot.crawl(search_terms, 'bilibili_danmaku.txt')

    # Summary statistics for the run.
    print(f"\n最终统计:")
    print(f"总弹幕数: {len(collected)}")
    if collected:
        print("前10条弹幕示例:")
        for rank, sample in enumerate(collected[:10], start=1):
            print(f"{rank}. {sample}")
|