"""Collect Bilibili danmaku (bullet comments) for videos matching a keyword.

Pipeline: search result pages -> BV ids -> cids -> danmaku XML -> text file.
Results are accumulated in the module-level lists ``list1`` (BV ids),
``list2`` (cids) and ``anslist`` (danmaku text).
"""
import json
import os
import re

import requests
import wordcloud
from bs4 import BeautifulSoup
# from video_bid import videobid

# Search keyword used to find the videos.
SEARCH_KEYWORD = "2024巴黎奥运会"
# Seconds to wait for any single HTTP request before giving up; without a
# timeout `requests` waits indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Accumulators: BV ids of the videos, their cids, and the danmaku text.
list1 = []
list2 = []
anslist = []

# NOTE(review): this is a logged-in session cookie (SESSDATA / bili_jct)
# committed to source. It should be rotated and supplied only through the
# BILIBILI_COOKIE environment variable; the literal is kept solely as a
# backward-compatible fallback.
_DEFAULT_COOKIE = "_uuid=C910E215C-7103E-4E54-9515-10FF7599AAECF78232infoc; buvid_fp=e7bba7e4bae311a280afa29479dcd19d; buvid3=A7EFF3D3-823F-E623-0EC8-4DE13D84710980740infoc; b_nut=1709977881; buvid4=B0B9DD4D-D469-44F5-6A8F-B2BCC72B405380740-024030909-YSaors5wcVdXxPAyYUcK7w%3D%3D; CURRENT_FNVAL=4048; rpdid=0zbfAGEiQM|8kXSwyAg|26E|3w1Swd2X; DedeUserID=440181187; DedeUserID__ckMd5=7aea6a85d9bac605; CURRENT_QUALITY=80; header_theme_version=CLOSE; enable_web_push=DISABLE; SESSDATA=218ebe14%2C1742048643%2C74a54%2A92CjCwT-WOtbM-xsZKbPnRuVAbDVtv--QmJvHh6khSys1se0CpcdYi5_2hO4THAprUtX0SVjMxenRnVjNiQ0JDd1ZtLVU1OWs3OUdOczNYWlZ6aWU2dkg2RHpWb2hZc00xOWJjWm9MVjdGNUcwLVY1ckYzOC1BUHViR0dvajZoeWpQb1M3ek9DUC13IIEC; bili_jct=51295d2894045195c66fdadf6b1b7a91; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NTU4OTQsImlhdCI6MTcyNjQ5NjYzNCwicGx0IjotMX0.Dr-RHjAMQ0qLpe3McjbIChH_d_S3UYkbXlb2doqMdWw; bili_ticket_expires=1726755834; home_feed_column=5; browser_resolution=1536-695; bp_t_offset_440181187=978115949112590336; sid=6xn7yn2o; b_lsid=BA647F49_1920109E5DD"

# Request headers that imitate a browser: user agent, session cookie and
# referer (the referer is sent as an anti-crawler measure).
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "cookie": os.environ.get("BILIBILI_COOKIE", _DEFAULT_COOKIE),
    "referer": "https://www.bilibili.com/",
}


def video_bid(num1):
    """Fetch search result page ``num1`` and append its BV ids to ``list1``.

    Args:
        num1: 1-based page number of the search results.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # The keyword lives in a constant so the f-string does not have to nest
    # the same quote character, which is a SyntaxError before Python 3.12.
    page_url = f"https://search.bilibili.com/all?keyword={SEARCH_KEYWORD}&page={num1}"
    response = requests.get(url=page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = response.apparent_encoding
    # Every `"bvid":"..."` occurrence in the page source is collected.
    list1.extend(re.findall(r'"bvid":"(.*?)"', response.text))


def video_cid(num2):
    """Look up the cid of the first part of video ``num2`` and store it in ``list2``.

    Videos whose page-list response is not JSON, or carries no usable
    ``data`` entry, are skipped instead of aborting the whole run.

    Args:
        num2: BV id of the video.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={num2}&jsonp=jsonp'
    response = requests.get(url=cid_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = response.apparent_encoding
    try:
        content = json.loads(response.text)
    except json.JSONDecodeError:
        print(f"skip {num2}: page list response is not JSON")
        return
    parts = content.get("data") if isinstance(content, dict) else None
    if not parts or not isinstance(parts[0], dict) or "cid" not in parts[0]:
        print(f"skip {num2}: no cid in page list response")
        return
    cid = parts[0]["cid"]
    if isinstance(cid, list):
        list2.extend(cid)
    else:
        list2.append(cid)


def video_bullet(num3):
    """Download the danmaku XML for cid ``num3`` and append each comment to ``anslist``.

    Args:
        num3: cid of the video part.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    bullet_url = f'https://comment.bilibili.com/{num3}.xml'
    response = requests.get(url=bullet_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = response.apparent_encoding
    chat_xml = BeautifulSoup(response.text, "xml")
    # Each <d> element holds the text of one danmaku.
    anslist.extend(d.text for d in chat_xml.find_all("d"))


def _run_each(step, items, label):
    """Call ``step`` on every item, reporting and skipping items whose request fails."""
    for item in items:
        try:
            step(item)
        except requests.RequestException as err:
            print(f"skip {label} {item}: {err}")


def main():
    """Run the full pipeline and append the collected danmaku to a text file."""
    # Pages 1 through 10 (30 videos per page according to the original
    # author's note). range() is required here: iterating the tuple (1, 11)
    # would visit only pages 1 and 11.
    _run_each(video_bid, range(1, 11), "page")
    # Snapshot the lists so the steps iterate a fixed set of ids.
    _run_each(video_cid, list(list1), "bvid")
    _run_each(video_bullet, list(list2), "cid")
    # Export the danmaku, one per line. The file is opened in append mode,
    # so repeated runs accumulate in the same file.
    blscn = '\n'.join(anslist)
    with open('所有弹幕.txt', mode='a', encoding='utf-8') as f:
        f.write(blscn)
    print("endprint")  # marker showing the script reached its normal end


if __name__ == '__main__':
    main()