From 45497433a489fb33188e0062c52772cd4d278775 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E7=8E=B2?= <1971708054@qq.com>
Date: Wed, 18 Sep 2024 20:54:52 +0800
Subject: [PATCH] =?UTF-8?q?=E5=91=BD=E5=90=8D=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
release/test_1.py | 77 ---------------
release/test_bilibili_danmu.py | 102 --------------------
release/test_getfor.py | 169 ---------------------------------
3 files changed, 348 deletions(-)
delete mode 100644 release/test_1.py
delete mode 100644 release/test_bilibili_danmu.py
delete mode 100644 release/test_getfor.py
diff --git a/release/test_1.py b/release/test_1.py
deleted file mode 100644
index 8a80c37..0000000
--- a/release/test_1.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import requests
-import re
-from lxml import etree
-import os
-
-class BiliBiliDanMu:
- def __init__(self, bv, filename):
- # 自动处理 BV 号,确保没有重复的 "BV" 前缀
- if bv.startswith("BV"):
- bv = bv[2:]
- # 根据 bv 号构造要爬取的视频 URL 地址
- self.video_url = "https://bilibili.com/video/BV" + bv
- self.filename = filename
- self.headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)\
- AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44"
- }
-
- # 获取视频的 cid
- def get_video_cid(self):
- response = requests.get(self.video_url, headers=self.headers, timeout=10)
- if response.status_code != 200:
- print(f"请求失败,状态码: {response.status_code}")
- return None
-
- html = response.content.decode()
- print(f"HTML 内容(前500字符): {html[:500]}") # 打印部分 HTML 内容用于调试
- cid = re.findall(r'("cid":)([0-9]+)', html)
- # 有的视频没有这个字段,我们跳过它
- if len(cid) == 0:
- print("未找到 cid")
- return None
- else:
- return cid[0][-1]
-
- # 获取请求弹幕 XML 文件返回的内容
- def get_content(self, xml_url):
- response = requests.get(xml_url, headers=self.headers, timeout=10)
- return response.content
-
- # 解析获取到的内容,得到包含视频所有弹幕的列表
- def extract_danmu(self, content_str):
- html = etree.HTML(content_str)
- danmu_list = html.xpath("//d/text()")
- return danmu_list
-
- # 将弹幕逐行写入并保存为 txt 文件
- def save(self, save_items):
- # 确保输出目录存在
- output_dir = os.path.dirname(self.filename)
- os.makedirs(output_dir, exist_ok=True) # 自动创建目录
-
- with open(self.filename, 'w', encoding='utf-8') as f:
- lines = []
- for item in save_items:
- lines.append(item + '\n')
- f.writelines(lines)
- print(f"弹幕已保存至 {self.filename}")
-
- # 爬虫的过程封装
- def crawl(self):
- cid = self.get_video_cid()
- # 跳过没有 cid 字段的视频
- if cid is not None:
- xml_url = "http://comment.bilibili.com/" + str(cid) + ".xml"
- content_str = self.get_content(xml_url)
- danmu_lst = self.extract_danmu(content_str)
- self.save(danmu_lst)
- else:
- print("视频没有有效的 cid,跳过此视频")
-
-if __name__ == '__main__':
- bv = input("请输入视频的 bv 号: ")
- # 处理文件名,确保路径正确
- filename = 'E:/前端/软件工程/{}.txt'.format(str(bv))
- dm = BiliBiliDanMu(bv, filename)
- dm.crawl()
\ No newline at end of file
diff --git a/release/test_bilibili_danmu.py b/release/test_bilibili_danmu.py
deleted file mode 100644
index c5610e8..0000000
--- a/release/test_bilibili_danmu.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import unittest
-from unittest.mock import patch, MagicMock
-from crawl_getthread import BiliBiliDanMu, search_videos, download_danmu, getthread
-import os
-
-class TestBiliBiliDanMu(unittest.TestCase):
- """
- 测试 BiliBiliDanMu 类的单元测试类。
- """
-
- @patch('crawl_getthread.requests.get')
- def test_get_video_cid_success(self, mock_get):
- """
- 测试根据视频 BV 号获取弹幕 CID 成功。
- """
- mock_get.return_value = MagicMock(status_code=200, text='{"cid":123456}')
- danmu_crawler = BiliBiliDanMu("BV12345", "test_output.txt")
- self.assertEqual(danmu_crawler.get_video_cid(), "123456")
-
- @patch('crawl_getthread.requests.get')
- def test_get_video_cid_failure(self, mock_get):
- """
- 测试根据视频 BV 号获取弹幕 CID 失败。
- """
- mock_get.return_value = MagicMock(status_code=404)
- danmu_crawler = BiliBiliDanMu("BV12345", "test_output.txt")
- self.assertIsNone(danmu_crawler.get_video_cid())
-
- @patch('crawl_getthread.requests.get')
- def test_get_content_success(self, mock_get):
- """
- 测试根据弹幕 XML URL 获取弹幕内容成功。
- """
- mock_get.return_value = MagicMock(status_code=200, text='弹幕内容')
- danmu_crawler = BiliBiliDanMu("BV12345", "test_output.txt")
- self.assertEqual(danmu_crawler.get_content("http://comment.bilibili.com/123456.xml"), '弹幕内容')
-
- @patch('crawl_getthread.requests.get')
- def test_get_content_failure(self, mock_get):
- """
- 测试根据弹幕 XML URL 获取弹幕内容失败。
- """
- mock_get.return_value = MagicMock(status_code=404)
- danmu_crawler = BiliBiliDanMu("BV12345", "test_output.txt")
- self.assertIsNone(danmu_crawler.get_content("http://comment.bilibili.com/123456.xml"))
-
- def test_extract_danmu(self):
- """
- 测试解析弹幕 XML 内容,提取弹幕文本。
- """
- danmu_crawler = BiliBiliDanMu("BV12345", "test_output.txt")
- content_str = '弹幕内容1弹幕内容2'
- self.assertEqual(danmu_crawler.extract_danmu(content_str), ['弹幕内容1', '弹幕内容2'])
-
- def test_save(self):
- """
- 测试保存弹幕到文件。
- """
- danmu_crawler = BiliBiliDanMu("BV12345", "test_output.txt")
- danmu_crawler.save(['弹幕内容1', '弹幕内容2'])
- with open("test_output.txt", 'r', encoding='utf-8') as f:
- self.assertEqual(f.readlines(), ['弹幕内容1\n', '弹幕内容2\n'])
- os.remove("test_output.txt") # 清理测试文件
-
- @patch('crawl_getthread.search_videos')
- def test_search_videos_success(self, mock_search_videos):
- """
- 测试根据关键词搜索视频并成功返回视频 ID 列表。
- """
- mock_search_videos.return_value = ['BV12345', 'BV67890']
- video_ids = search_videos("2024巴黎奥运会", max_results=2)
- self.assertEqual(video_ids, ['BV12345', 'BV67890'])
-
- @patch('crawl_getthread.search_videos')
- def test_search_videos_failure(self, mock_search_videos):
- """
- 测试根据关键词搜索视频时请求失败。
- """
- mock_search_videos.return_value = []
- video_ids = search_videos("2024巴黎奥运会", max_results=2)
- self.assertEqual(video_ids, [])
-
- @patch('crawl_getthread.download_danmu')
- def test_download_danmu(self, mock_download):
- """
- 测试下载指定 BV 号视频的弹幕。
- """
- mock_download.return_value = None
- download_danmu(0, "BV12345", "test_output.txt")
- mock_download.assert_called_with(0, "BV12345", "test_output.txt")
-
- @patch('crawl_getthread.getthread')
- def test_getthread(self, mock_getthread):
- """
- 测试使用线程池并发下载弹幕。
- """
- mock_getthread.return_value = None
- getthread()
- mock_getthread.assert_called_once()
-
-if __name__ == '__main__':
- unittest.main() # 执行测试
\ No newline at end of file
diff --git a/release/test_getfor.py b/release/test_getfor.py
deleted file mode 100644
index 7901efb..0000000
--- a/release/test_getfor.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# import cProfile
-import requests
-import re
-from lxml import etree
-import os
-import time
-import random
-from collections import OrderedDict
-
-class BiliBiliDanMu:
- def __init__(self, bv, filename):
- if bv.startswith("BV"):
- bv = bv[2:]
- self.video_url = "https://bilibili.com/video/BV" + bv
- self.filename = filename
- self.headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
- "Referer": "https://www.bilibili.com/",
- "Accept": "application/json, text/plain, */*",
- "Accept-Language": "zh-CN,zh;q=0.9",
- }
-
- def get_video_cid(self):
- retry_count = 3
- for attempt in range(retry_count):
- try:
- response = requests.get(self.video_url, headers=self.headers, timeout=10)
- if response.status_code != 200:
- print(f"请求失败,状态码: {response.status_code}")
- continue
-
- html = response.content.decode()
- cid = re.findall(r'("cid":)([0-9]+)', html)
- if not cid:
- print("未找到 cid")
- continue
- else:
- return cid[0][-1]
- except requests.exceptions.RequestException as e:
- print(f"获取 cid 时出错: {e}")
- print(f"第 {attempt + 1} 次重试获取 cid...")
- time.sleep(2)
- return None
-
- def get_content(self, xml_url):
- try:
- response = requests.get(xml_url, headers=self.headers, timeout=10)
- if response.status_code == 200:
- return response.content
- else:
- print(f"获取弹幕内容失败,状态码: {response.status_code}")
- return None
- except requests.exceptions.RequestException as e:
- print(f"获取弹幕时出错: {e}")
- return None
-
- def extract_danmu(self, content_str):
- try:
- html = etree.HTML(content_str)
- danmu_list = html.xpath("//d/text()")
- return danmu_list
- except Exception as e:
- print(f"解析弹幕时出错: {e}")
- return []
-
- def save(self, save_items):
- output_dir = os.path.dirname(self.filename)
- os.makedirs(output_dir, exist_ok=True)
-
- with open(self.filename, 'w', encoding='utf-8') as f:
- lines = [item + '\n' for item in save_items]
- f.writelines(lines)
- print(f"弹幕已保存至 {self.filename}")
-
- def crawl(self):
- cid = self.get_video_cid()
- if cid is not None:
- xml_url = "http://comment.bilibili.com/" + str(cid) + ".xml"
- content_str = self.get_content(xml_url)
- if content_str:
- danmu_lst = self.extract_danmu(content_str)
- self.save(danmu_lst)
- else:
- print("视频没有有效的 cid,跳过此视频")
-
-
-def search_videos(query, max_results=350):
- search_url = "https://api.bilibili.com/x/web-interface/search/type"
- headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
- "Referer": "https://www.bilibili.com/",
- "Accept": "application/json, text/plain, */*",
- "Accept-Language": "zh-CN,zh;q=0.9",
- "cookie": "your cookie" # 替换为实际的 Cookie
- }
-
- bv_list = []
- page = 1
-
- while len(bv_list) < max_results:
- params = {
- 'keyword': query,
- 'search_type': 'video',
- 'order': 'totalrank',
- 'page': page,
- 'pagesize': 50
- }
-
- try:
- response = requests.get(search_url, params=params, headers=headers, timeout=10)
- if response.status_code == 200:
- results = response.json()
- if results['code'] == 0:
- videos = results['data']['result']
- if not videos:
- break
- bv_list += [video['bvid'] for video in videos]
- print(f"已抓取 {len(bv_list)} 个视频")
- else:
- print(f"搜索失败,错误代码: {results['code']},错误信息: {results.get('message', '无详细信息')}")
- if '频繁' in results.get('message', ''):
- print("限流,等待后重试")
- time.sleep(random.uniform(5, 10))
- continue
- break
- else:
- print(f"搜索请求失败,状态码: {response.status_code}")
- break
- except requests.exceptions.RequestException as e:
- print(f"请求失败,错误: {e}")
- time.sleep(random.uniform(2, 5))
- continue
-
- page += 1
- time.sleep(random.uniform(1, 3))
-
- bv_list = list(OrderedDict.fromkeys(bv_list)) # 去重操作
- return bv_list[:max_results]
-
-
-def download_danmu(index, bv, filename):
- danmu_crawler = BiliBiliDanMu(bv, filename)
- danmu_crawler.crawl()
-
-def getfor():
- for index, bv in enumerate(bv_list):
- filename = f'{output_dir}第{index + 1}个视频_{bv}.txt'
- print(f"正在抓取 BV号 {bv} 的弹幕...")
- download_danmu(index, bv, filename)
- print(f"BV号 {bv} 的弹幕抓取完成")
-
-
-if __name__ == '__main__':
- query = input("请输入搜索关键词: ")
- bv_list = search_videos(query)
-
- # 限制爬取的最大视频数量为300
- bv_list = bv_list[:300]
-
- output_dir = 'E:/前端/软件工程/弹幕收集first/'
- os.makedirs(output_dir, exist_ok=True)
-
- # 依次抓取每个视频的弹幕
- getfor()
- # for index, bv in enumerate(bv_list):
- # filename = f'{output_dir}第{index + 1}个视频_{bv}.txt'
- # print(f"正在抓取 BV号 {bv} 的弹幕...")
- # download_danmu(index, bv, filename)
- # print(f"BV号 {bv} 的弹幕抓取完成")
\ No newline at end of file