You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

91 lines
3.4 KiB

# src/crawler.py
import re
import time
from urllib.parse import quote

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
class VideoCrawler:
    """Edge/Selenium crawler that collects Bilibili video links from search pages.

    The instance owns a WebDriver session (``self.driver``); call
    :meth:`close` when finished so the browser process is shut down.
    """

    # Compiled once: matches video page URLs such as ".../video/BV1xx411c7mD".
    _VIDEO_URL_RE = re.compile(r"bilibili\.com/video/BV\w+")

    # Number of consecutive result pages that may yield no new link before
    # the crawl gives up. This bounds the pagination loop when the "next"
    # button never reports itself as disabled.
    _MAX_STALE_PAGES = 3

    def __init__(self, driver_path=r"D:\l1\drivers\msedgedriver.exe", headless=True, wait=2):
        """Start the Edge driver.

        Args:
            driver_path: Path to the ``msedgedriver`` executable.
            headless: When true, Edge is started with ``--headless=new``.
            wait: Seconds to sleep after loading the search page; one extra
                second is added after each "next page" click.
        """
        options = Options()
        # NOTE(review): SOURCE lost its indentation; only the headless flag is
        # assumed to be conditional, the remaining flags are always applied.
        if headless:
            options.add_argument("--headless=new")
        options.add_argument("--disable-gpu")
        options.add_argument("--disable-software-rasterizer")
        options.add_argument("--no-sandbox")
        # NOTE(review): the next two flags relax browser security checks;
        # confirm they are really required for this crawl.
        options.add_argument("--disable-web-security")
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--no-proxy-server")
        options.add_argument("--log-level=3")
        service = Service(driver_path)
        self.driver = webdriver.Edge(service=service, options=options)
        self.wait = wait

    @staticmethod
    def _build_search_url(keyword):
        """Return the search URL for *keyword* with the keyword percent-encoded."""
        # safe="" so that "/", "&", "#", spaces etc. cannot break the query string.
        encoded = quote(str(keyword), safe="")
        return f"https://search.bilibili.com/all?keyword={encoded}&order=综合"

    @classmethod
    def _unique_video_links(cls, hrefs):
        """Return the video URLs among *hrefs*, de-duplicated in first-seen order."""
        matching = (href for href in hrefs if href and cls._VIDEO_URL_RE.search(href))
        # dict preserves insertion order, so the result follows page order.
        return list(dict.fromkeys(matching))

    # Automatic video-link detection.
    def auto_find_videos(self):
        """Return the unique video links found in the ``<a>`` tags of the current page.

        Returns:
            A list of ``href`` values matching the video URL pattern, without
            duplicates, in the order they appear on the page.
        """
        anchors = self.driver.find_elements(By.TAG_NAME, "a")
        return self._unique_video_links(a.get_attribute("href") for a in anchors)

    # Search with pagination support.
    def search_and_collect(self, keyword, max_videos=300):
        """Search for *keyword* and collect video links, following result pages.

        Args:
            keyword: Search term; percent-encoded before being placed in the URL.
            max_videos: Maximum number of links to return.

        Returns:
            A list of unique video URLs in discovery order, at most
            ``max_videos`` long.
        """
        results = []
        seen = set()  # O(1) duplicate check; `results` keeps discovery order
        search_url = self._build_search_url(keyword)
        print(f"🔍 正在搜索:{search_url}")
        self.driver.get(search_url)
        time.sleep(self.wait)

        page = 1
        stale_pages = 0
        while len(results) < max_videos:
            print(f"📄 第 {page} 页抓取中...")
            before = len(results)
            for link in self.auto_find_videos():
                if link in seen:
                    continue
                seen.add(link)
                results.append(link)
                if len(results) >= max_videos:
                    break
            added = len(results) - before
            # Report links actually added on this page, not links merely found.
            print(f"✅ 第{page}页:新增 {added} 条,累计 {len(results)}")

            # Quota reached: do not navigate (and sleep) for a page we won't read.
            if len(results) >= max_videos:
                break

            # Stop if several pages in a row produced nothing new; otherwise a
            # page that never advances would keep this loop running forever.
            stale_pages = 0 if added else stale_pages + 1
            if stale_pages >= self._MAX_STALE_PAGES:
                print(f"⚠️ 连续 {stale_pages} 页没有新增链接,停止。")
                break

            # Try to click the "next page" button.
            try:
                next_button = self.driver.find_element(By.CSS_SELECTOR, ".vui_pagenation--btn-next")
                if "disabled" in next_button.get_attribute("class"):
                    print("⚠️ 已到最后一页,停止。")
                    break
                # Click through JavaScript rather than WebElement.click().
                self.driver.execute_script("arguments[0].click();", next_button)
                time.sleep(self.wait + 1)
                page += 1
            except Exception as e:
                # Any failure here (button missing, driver error, ...) ends the
                # crawl but keeps whatever has been collected so far.
                print(f"⚠️ 翻页失败或已无下一页:{e}")
                break

        print(f"✅ 共找到 {len(results)} 个视频链接。")
        return results[:max_videos]

    def close(self):
        """Quit the browser; shutdown errors are reported but never raised."""
        try:
            self.driver.quit()
        except Exception as e:
            # Best-effort cleanup. `Exception` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            print(f"⚠️ 关闭浏览器失败:{e}")
if __name__ == "__main__":
    # Demo run: open a visible Edge window, search for one keyword and print
    # every collected video link.
    c = VideoCrawler(headless=False)
    try:
        links = c.search_and_collect("大语言模型", max_videos=300)
        print("\n🎯 抓取到的视频链接:")
        for link in links:
            print(link)
    finally:
        # Always shut the browser down, even if the crawl raised, so the Edge
        # and driver processes are not left running.
        c.close()