You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
91 lines
3.4 KiB
91 lines
3.4 KiB
# src/crawler.py
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.edge.service import Service
|
|
from selenium.webdriver.edge.options import Options
|
|
import time
|
|
import re
|
|
|
|
|
|
class VideoCrawler:
    """Collect Bilibili video links from search result pages via an Edge WebDriver.

    Each instance owns one browser session. Call :meth:`close` when done, or
    use the instance as a context manager so the browser is always shut down.
    """

    # Matches hrefs that point at a Bilibili video page (".../video/BV...").
    _VIDEO_URL_RE = re.compile(r"bilibili\.com/video/BV\w+")

    # Give up paging after this many consecutive pages that add nothing new;
    # otherwise a page that never changes (while the "next" button stays
    # enabled) would keep the loop running forever.
    _MAX_STALLED_PAGES = 3

    def __init__(self, driver_path=r"D:\l1\drivers\msedgedriver.exe", headless=True, wait=2):
        """Start an Edge browser session.

        Args:
            driver_path: Path to the ``msedgedriver`` executable.
            headless: When true, run Edge without a visible window.
            wait: Seconds to sleep after loading the search page; each page
                turn sleeps one second longer than this.
        """
        options = Options()
        if headless:
            options.add_argument("--headless=new")
        for flag in (
            "--disable-gpu",
            "--disable-software-rasterizer",
            "--no-sandbox",
            "--disable-web-security",
            "--ignore-certificate-errors",
            "--no-proxy-server",
            "--log-level=3",
        ):
            options.add_argument(flag)

        service = Service(driver_path)
        self.driver = webdriver.Edge(service=service, options=options)
        self.wait = wait

    def __enter__(self):
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Close the browser on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    @staticmethod
    def _build_search_url(keyword):
        """Return the search URL for ``keyword`` with the keyword percent-encoded."""
        from urllib.parse import quote

        # Encode the keyword so characters such as '&', '#' or spaces cannot
        # terminate or corrupt the query string.
        encoded = quote(str(keyword), safe="")
        return f"https://search.bilibili.com/all?keyword={encoded}&order=综合"

    # Automatic video-link detection
    def auto_find_videos(self):
        """Return the distinct video links on the current page, in page order.

        Every ``<a>`` element is inspected and its ``href`` is kept when it
        matches the Bilibili video URL pattern.
        """
        # A dict keeps first-seen order while de-duplicating, so the result
        # is deterministic for a given page (a set would not be).
        found = {}
        for anchor in self.driver.find_elements(By.TAG_NAME, "a"):
            href = anchor.get_attribute("href")
            if href and self._VIDEO_URL_RE.search(href):
                found.setdefault(href, None)
        return list(found)

    # Search with pagination support
    def search_and_collect(self, keyword, max_videos=300):
        """Search for ``keyword`` and gather video links across result pages.

        Args:
            keyword: Search term; it is percent-encoded before being placed
                in the URL.
            max_videos: Upper bound on the number of links returned.

        Returns:
            A list of at most ``max_videos`` unique links in discovery order.
        """
        results = []
        seen = set()  # O(1) duplicate check alongside the ordered list

        search_url = self._build_search_url(keyword)
        print(f"🔍 正在搜索:{search_url}")
        self.driver.get(search_url)
        time.sleep(self.wait)

        page = 1
        stalled_pages = 0
        while len(results) < max_videos:
            print(f"📄 第 {page} 页抓取中...")

            added = 0
            for link in self.auto_find_videos():
                if len(results) >= max_videos:
                    break
                if link not in seen:
                    seen.add(link)
                    results.append(link)
                    added += 1

            # Report how many links this page actually contributed.
            print(f"✅ 第{page}页:新增 {added} 条,累计 {len(results)} 条")

            # Quota reached: no reason to click through to another page.
            if len(results) >= max_videos:
                break

            stalled_pages = stalled_pages + 1 if added == 0 else 0
            if stalled_pages >= self._MAX_STALLED_PAGES:
                print("⚠️ 连续多页没有新增链接,停止。")
                break

            # Try to click the "next page" button.
            try:
                next_button = self.driver.find_element(By.CSS_SELECTOR, ".vui_pagenation--btn-next")
                # get_attribute may return None when the attribute is absent.
                if "disabled" in (next_button.get_attribute("class") or ""):
                    print("⚠️ 已到最后一页,停止。")
                    break
                self.driver.execute_script("arguments[0].click();", next_button)
            except Exception as e:
                # Best-effort paging: any failure here ends the crawl with
                # whatever has been collected so far.
                print(f"⚠️ 翻页失败或已无下一页:{e}")
                break

            time.sleep(self.wait + 1)
            page += 1

        print(f"✅ 共找到 {len(results)} 个视频链接。")
        return results

    def close(self):
        """Quit the browser; failures are reported but never raised."""
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as exc:
            print(f"⚠️ 关闭浏览器失败:{exc}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo run: open a visible browser, collect links for one keyword, and
    # print them one per line.
    c = VideoCrawler(headless=False)
    try:
        links = c.search_and_collect("大语言模型", max_videos=300)
        print("\n🎯 抓取到的视频链接:")
        for link in links:
            print(link)
    finally:
        # Always shut the browser down, even if the crawl raises.
        c.close()
|