You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
100 lines
3.5 KiB
100 lines
3.5 KiB
2 months ago
|
import requests
|
||
|
import re
|
||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
|
|
||
|
# Running total of cids collected so far; incremented by get_cid().
count = 0
|
||
|
|
||
|
|
||
|
def get_page_url(n):
    """Build the Bilibili search-result URLs for the first ``n`` pages.

    The search keyword is hard-coded (already URL-encoded) in the base URL.
    The first page uses the bare search URL; each later page appends a
    1-based ``page`` parameter and an ``o`` parameter equal to 36 times the
    zero-based page index.

    Returns a list of ``n`` URL strings (empty when ``n`` is 0 or negative).
    """
    first_page = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5"
    return [
        first_page if index == 0 else f"{first_page}&page={index + 1}&o={index * 36}"
        for index in range(n)
    ]
|
||
|
|
||
|
|
||
|
# Request headers passed to requests.get by the fetch helpers: a desktop
# browser User-Agent string.
header = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    ),
}
|
||
|
|
||
|
|
||
|
def fetch_page(url, timeout=10):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to request; the module-level ``header`` dict is sent
            as the request headers.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the calling worker
            thread indefinitely.

    Returns:
        The response text, or an empty string when the request fails
        (connection error, timeout, or an HTTP error status). Failures are
        reported on stdout rather than raised, so callers can treat "" as
        "nothing to parse".
    """
    try:
        response = requests.get(url=url, headers=header, timeout=timeout)
        # Surface 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return ""
    # Force UTF-8 rather than relying on requests' charset detection.
    response.encoding = 'utf-8'
    return response.text
|
||
|
|
||
|
|
||
|
def get_cid(page_url_list, max_count=300):
    """Collect one cid per video linked from the given search-result pages.

    The pages are downloaded concurrently with ``fetch_page``. For each
    distinct video id (bvid) found in a page, the player "pagelist" API is
    queried and the first cid in its response is kept.

    Args:
        page_url_list: Search-result page URLs to scan.
        max_count: Maximum number of cids to collect in this call. The limit
            is tracked per call, so the function can be invoked repeatedly.

    Returns:
        A list of cid strings (as captured from the API response), at most
        ``max_count`` long.

    Side effects:
        Increments the module-level ``count`` once per collected cid and
        prints progress to stdout.
    """
    global count
    cid_list = []
    if max_count <= 0:
        return cid_list

    seen_bvids = set()  # avoid re-querying a video that appears on several pages
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fetch_page, url) for url in page_url_list]
        for future in as_completed(futures):
            try:
                html = future.result()
                # dict.fromkeys drops duplicates within the page while
                # keeping the order in which the links appear.
                bvids = dict.fromkeys(
                    re.findall(r'"//www\.bilibili\.com/video/(.*?)/"', html)
                )
                for bvid in bvids:
                    if bvid in seen_bvids:
                        continue
                    seen_bvids.add(bvid)
                    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
                    cids = re.findall(r'\{"cid":(.*?),', fetch_page(url))
                    if not cids:
                        continue
                    cid_list.append(cids[0])
                    count += 1
                    print(f"已获取到 {len(cid_list)} 个cid")
                    if len(cid_list) >= max_count:
                        break
            except Exception as e:
                # Best effort: a failure on one page must not abort the rest.
                print(f"处理失败: {e}")
            if len(cid_list) >= max_count:
                # Enough cids: drop page fetches that have not started yet so
                # leaving the ``with`` block does not wait on needless requests.
                for pending in futures:
                    pending.cancel()
                break
    return cid_list
|
||
|
|
||
|
|
||
|
def get_danmu(cid_list):
    """Download the danmaku (bullet comments) for every cid in ``cid_list``.

    Each cid's comment XML is fetched concurrently and the text of every
    ``<d p="...">`` element is extracted.

    Args:
        cid_list: Iterable of cid values used to build the comment XML URLs.

    Returns:
        A flat list of comment strings. Results are appended in the order
        the downloads complete, not in ``cid_list`` order. A cid whose
        download fails contributes nothing.
    """
    def fetch_danmu(cid):
        # Reuse the shared fetch helper (same headers, UTF-8 decoding and
        # failure message) instead of duplicating the request code here.
        xml_text = fetch_page(f"https://comment.bilibili.com/{cid}.xml")
        return re.findall(r'<d p=".*?">(.*?)</d>', xml_text)

    danmu_list = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fetch_danmu, cid) for cid in cid_list]
        for future in as_completed(futures):
            try:
                danmu_list.extend(future.result())
                print(f"已获取到 {len(danmu_list)} 条弹幕")
            except Exception as e:
                # Best effort: one failed download must not lose the others.
                print(f"处理失败: {e}")
    return danmu_list
|
||
|
|
||
|
|
||
|
# Script body: scan the first 10 search pages for cids, download their
# danmaku, and write one comment per line to a UTF-8 text file.
cid_list = get_cid(get_page_url(10))
print("开始获取弹幕数据...")
danmu_list = get_danmu(cid_list)
print("弹幕数据爬取完成。")

with open('弹幕.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{danmu}\n" for danmu in danmu_list)

print("弹幕已保存到 '弹幕.txt'")
|