From 9e7d1e7360216c644491953ad4974bb9b0468aa9 Mon Sep 17 00:00:00 2001
From: pjmw9izve <2308014474@qq.com>
Date: Tue, 17 Sep 2024 12:41:56 +0800
Subject: [PATCH] ADD file via upload

---
 get_300urls.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 get_300urls.py

diff --git a/get_300urls.py b/get_300urls.py
new file mode 100644
index 0000000..388a2c5
--- /dev/null
+++ b/get_300urls.py
@@ -0,0 +1,56 @@
+import requests
+from bs4 import BeautifulSoup
+import time
+
+def get_300videos_urls(keyword):
+    """Collect up to 300 video URLs from Bilibili search results."""
+    page = 1
+    urls = []  # collected video links
+
+    while len(urls) < 300:
+        url = f"https://search.bilibili.com/video?keyword={keyword}&page={page}"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
+        }
+
+        try:
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+            time.sleep(0.5)  # throttle requests between pages
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+            found_new = False  # whether this page yielded any new links
+
+            # Find every <a> tag that carries an href attribute
+            for item in soup.find_all('a', href=True):
+                href = item['href']
+
+                # Normalize protocol-relative URLs that start with //
+                if href.startswith('//'):
+                    href = 'https:' + href
+
+                # Keep only video links, skipping duplicates
+                if '/video/' in href and href not in urls:
+                    urls.append(href)
+                    found_new = True
+
+                    # Stop once 300 links have been collected
+                    if len(urls) >= 300:
+                        break
+
+            # Stop when the target is reached or the page had no new links
+            if len(urls) >= 300 or not found_new:
+                break
+
+            page += 1  # move on to the next results page
+
+        except requests.RequestException as e:
+            print(f"Error fetching URL from page {page}: {e}")
+            break  # stop instead of retrying the same page forever
+
+    if not urls:
+        print("Failed to retrieve any URLs.")
+    else:
+        print(f"Retrieved {len(urls)} URLs.")
+
+    return urls
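
Usage note (not part of the patch): a minimal sketch of how the uploaded get_300videos_urls function might be called once get_300urls.py is on the import path. The search keyword "python" and the output filename urls.txt below are illustrative assumptions, not anything taken from the original upload.

    # Minimal usage sketch; the keyword and output file are assumptions.
    from get_300urls import get_300videos_urls

    if __name__ == "__main__":
        video_urls = get_300videos_urls("python")  # hypothetical search keyword
        with open("urls.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(video_urls))
        print(f"Saved {len(video_urls)} links to urls.txt")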