You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
spider/spider_main.py

68 lines
3.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import json
import re
import wordcloud
from bs4 import BeautifulSoup
# from video_bid import videobid
# 基本模块
# Shared accumulators filled in by the scraping functions below:
#   list1   - video bvid strings found on the search pages
#   list2   - cid values looked up for each bvid
#   anslist - text of every danmaku (bullet comment) collected
list1, list2, anslist = [], [], []
# Request headers sent with every request so that it looks like it comes from a browser.
# NOTE(review): the cookie value below appears to embed real account session
# fields (e.g. SESSDATA, bili_jct) directly in source; consider loading it from
# the environment or an untracked config file and rotating it -- TODO confirm.
headers = {
# cookie: user information; user-agent: browser identification; referer: sent to get past anti-scraping checks
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
"cookie":"_uuid=C910E215C-7103E-4E54-9515-10FF7599AAECF78232infoc; buvid_fp=e7bba7e4bae311a280afa29479dcd19d; buvid3=A7EFF3D3-823F-E623-0EC8-4DE13D84710980740infoc; b_nut=1709977881; buvid4=B0B9DD4D-D469-44F5-6A8F-B2BCC72B405380740-024030909-YSaors5wcVdXxPAyYUcK7w%3D%3D; CURRENT_FNVAL=4048; rpdid=0zbfAGEiQM|8kXSwyAg|26E|3w1Swd2X; DedeUserID=440181187; DedeUserID__ckMd5=7aea6a85d9bac605; CURRENT_QUALITY=80; header_theme_version=CLOSE; enable_web_push=DISABLE; SESSDATA=218ebe14%2C1742048643%2C74a54%2A92CjCwT-WOtbM-xsZKbPnRuVAbDVtv--QmJvHh6khSys1se0CpcdYi5_2hO4THAprUtX0SVjMxenRnVjNiQ0JDd1ZtLVU1OWs3OUdOczNYWlZ6aWU2dkg2RHpWb2hZc00xOWJjWm9MVjdGNUcwLVY1ckYzOC1BUHViR0dvajZoeWpQb1M3ek9DUC13IIEC; bili_jct=51295d2894045195c66fdadf6b1b7a91; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NTU4OTQsImlhdCI6MTcyNjQ5NjYzNCwicGx0IjotMX0.Dr-RHjAMQ0qLpe3McjbIChH_d_S3UYkbXlb2doqMdWw; bili_ticket_expires=1726755834; home_feed_column=5; browser_resolution=1536-695; bp_t_offset_440181187=978115949112590336; sid=6xn7yn2o; b_lsid=BA647F49_1920109E5DD",
"referer":"https://www.bilibili.com/"
}
def video_bid(num1, keyword="2024巴黎奥运会", timeout=10):
    """Collect the video IDs found on one search-result page.

    Requests page ``num1`` of the bilibili search results for ``keyword``
    and appends every ``"bvid"`` value found in the response text to the
    module-level ``list1``.

    Args:
        num1: Page number of the search results to request.
        keyword: Search term; defaults to the term this script was written for.
        timeout: Seconds to wait for the server before ``requests`` raises
            instead of blocking forever.

    Returns:
        None. Matches are accumulated in ``list1`` (duplicates are kept).
    """
    # Interpolate the keyword from a variable: nesting a double-quoted literal
    # inside a double-quoted f-string is a syntax error before Python 3.12.
    page_url = f"https://search.bilibili.com/all?keyword={keyword}&page={num1}"
    response = requests.get(url=page_url, headers=headers, timeout=timeout)
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    # The IDs are pulled straight out of the page text with a regex;
    # list1 is mutated in place, so no ``global`` statement is needed.
    list1.extend(re.findall(r'"bvid":"(.*?)"', response.text))
def video_cid(num2, timeout=10):
    """Look up the cid of a video and record it in ``list2``.

    Queries the player page-list API for the video ID ``num2``, parses the
    JSON reply and appends the ``cid`` of the first entry under ``"data"``
    to the module-level ``list2``. Only that first entry is used.

    Args:
        num2: Video ID (bvid) to look up.
        timeout: Seconds to wait for the server before ``requests`` raises
            instead of blocking forever.

    Returns:
        None. When the reply carries no usable ``"data"`` list the video is
        skipped and ``list2`` is left unchanged.
    """
    cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={num2}&jsonp=jsonp'
    response = requests.get(url=cid_url, headers=headers, timeout=timeout)
    response.encoding = response.apparent_encoding
    content = json.loads(response.text)

    pages = content.get("data")
    if not pages:
        # A missing, null or empty "data" would otherwise raise here and
        # abort the whole crawl because of a single video.
        return

    cid = pages[0]["cid"]
    # Accept either a single cid or a list of them, as the original did.
    if isinstance(cid, list):
        list2.extend(cid)
    else:
        list2.append(cid)
def video_bullet(num3, timeout=10):
    """Download the danmaku (bullet comments) for one cid into ``anslist``.

    Requests the comment XML for ``num3``, parses it and appends the text of
    every ``<d>`` element to the module-level ``anslist``.

    Args:
        num3: cid whose comment XML should be downloaded.
        timeout: Seconds to wait for the server before ``requests`` raises
            instead of blocking forever.

    Returns:
        None. Comment texts are accumulated in ``anslist``.
    """
    bullet_url = f'https://comment.bilibili.com/{num3}.xml'
    response = requests.get(url=bullet_url, headers=headers, timeout=timeout)
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    chat_xml = BeautifulSoup(response.text, "xml")
    # Each <d> element holds one comment; anslist is mutated in place.
    anslist.extend(d.text for d in chat_xml.find_all("d"))
if __name__ == '__main__':
    # Collect video IDs from search pages 1 through 10.
    # range(1, 11) is required here: iterating the tuple (1, 11) would
    # request only page 1 and page 11.
    for page in range(1, 11):
        video_bid(page)
    # Look up the cid for every collected video ID.
    for bv in list1:
        video_cid(bv)
    # Download the danmaku for every cid.
    for cid in list2:
        video_bullet(cid)
    # Write all collected danmaku to a local file, one per line.
    # mode='a' appends to the file if it already exists.
    blscn = '\n'.join(anslist)
    with open('所有弹幕.txt', mode='a', encoding='utf-8') as f:
        f.write(blscn)
    print("endprint")  # marker showing the script reached the end normally