# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 56 lines
# 3.7 KiB

import requests
import re
import time
import pandas as pd
from collections import Counter
import wordcloud
import matplotlib.pyplot as plt
# Upper bound on how many videos have their danmaku downloaded.
video_num = 300

# Request headers used for the search-page and video-page requests.
# NOTE(review): the cookie appears to carry a logged-in session
# (SESSDATA / bili_jct / DedeUserID); consider loading it from the
# environment instead of keeping it in source control.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    ),
    "cookie": (
        "buvid3=54DE8606-3021-2FB6-4872-357B0704095E16527infoc; "
        "b_nut=1726194016; "
        "_uuid=E15C76EA-7673-A719-181C-B310137107ECD121742infoc; "
        "enable_web_push=DISABLE; "
        "home_feed_column=5; "
        "buvid_fp=15814142e80dfa9c068eed7a71851bf5; "
        "buvid4=76281DE5-AB09-6D18-BA67-2665764B23E418108-024091302-QuJvEpYDn4lWADFDZJ3oVg%3D%3D; "
        "SESSDATA=90f76878%2C1742031396%2C370e0%2A92CjBC5iP1hhGyQ1Xw0rdbO9xgMM2_MTiR33GjObW1Q6tORSBoVnJm05JChaZAeeHOpRgSVmRUZlVSbVFGWmhRWDA2NE9nWUFyNENOT1BDYS1RSkVEOVVka3R4dXh4R1FDTE1KTDJFQndDNVlsVGFla3RLY0NZQ2pET1BqMEw1MkloMmRLZU9XR2xRIIEC; "
        "bili_jct=b1ef05e044aee6835cda207b0139fa50; "
        "DedeUserID=34740935; "
        "DedeUserID__ckMd5=071ab34a61265a21; "
        "CURRENT_FNVAL=4048; "
        "bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3Mzg2MDksImlhdCI6MTcyNjQ3OTM0OSwicGx0IjotMX0.cutEpP5MEoNMCXJT4-E1j4YcTZyj_DjFnqKO6Fcn1n8; "
        "bili_ticket_expires=1726738549; "
        "rpdid=|(RYkm|Yul)0J'u~kYR~mRuk; "
        "bp_t_offset_34740935=978030977479606272; "
        "header_theme_version=CLOSE; "
        "browser_resolution=1699-941; "
        "b_lsid=CD3ABBDF_191FEA3B82D"
    ),
}
# Scrape the bvid of every video on search-result pages 1-10 and store them
# in bvid.txt, one per line, in first-seen order and without duplicates.
#
# The ids are de-duplicated in memory and the file is written once at the
# end. A handle opened in 'a+' mode starts positioned at end-of-file, so
# reading it back to detect duplicates always yields an empty string and
# filters nothing.
#
# NOTE(review): the keyword is "20224" followed by the URL-encoded Chinese
# for "Paris Olympics"; "20224" looks like a typo for "2024" -- confirm
# before changing, since it alters the search results.
_search_url = "https://search.bilibili.com/all?vt=60711865&keyword=20224%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3"
_bvid_pattern = re.compile(r'bvid:"(.*?)",title:')  # matches the bvid of each result
_seen_bvids = set()
_ordered_bvids = []
for page0 in range(1, 11):
    # The first page is requested without an explicit page parameter.
    url = _search_url if page0 == 1 else f"{_search_url}&page={page0}"
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=10)
    for bvid in _bvid_pattern.findall(response.text):
        # Exact-match de-duplication; empty captures are ignored.
        if bvid and bvid not in _seen_bvids:
            _seen_bvids.add(bvid)
            _ordered_bvids.append(bvid)

# Write (and truncate) the bvid file in one pass; the context manager
# guarantees the handle is closed.
with open('bvid.txt', mode='w', encoding='utf-8') as f:
    f.writelines(f"{bvid}\n" for bvid in _ordered_bvids)
# Helper that downloads and stores the danmaku of a single video.
def get_danmu(cid):
    """Download the danmaku for one video and append them to the output file.

    The danmaku XML is requested from the ``dm/list.so`` endpoint, the text
    of every ``<d p="...">`` element is extracted, the resulting list is
    printed, and each entry is appended as one line to ``爬取的总弹幕.txt``.

    Args:
        cid: Value substituted into the ``oid`` query parameter of the
            danmaku endpoint.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            exceeds the 10-second timeout.
    """
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    headers1 = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }
    # A timeout keeps one stalled connection from hanging the whole run.
    resp = requests.get(url, headers=headers1, timeout=10)
    resp.encoding = "utf-8"  # decode the response body as UTF-8
    context = re.findall(r'<d p=".*?">(.*?)</d>', resp.text)  # danmaku texts
    print(context)
    if not context:
        # Nothing to store; do not create or touch the output file.
        return
    # Open the output file once per video instead of once per danmaku line.
    with open('爬取的总弹幕.txt', mode='a', encoding='utf-8') as f:
        f.writelines(f"{text}\n" for text in context)
# Resolve each stored bvid to its cid (passed on as the danmaku oid) and
# download that video's danmaku, stopping after `video_num` videos.
with open('bvid.txt', mode='r', encoding='utf-8') as f1:  # closed on exit
    bvid_text = f1.read().splitlines()

# The pattern must not contain extra whitespace, otherwise it will not match
# the embedded player data in the video page.
_cid_pattern = re.compile(r'"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),')

count = 0  # number of videos processed so far
for bvid1 in bvid_text:
    # Check the limit before fetching so a non-positive video_num fetches nothing.
    if count >= video_num:
        break
    # A blank line would produce a malformed video URL; skip it.
    if not bvid1.strip():
        continue
    count += 1
    url = f"https://www.bilibili.com/video/{bvid1}/?spm_id_from=333.337.search-card.all.click&vd_source=516714ff716c382225c801afa2c87d8d"
    # A timeout keeps one stalled connection from hanging the whole run.
    res0 = requests.get(url, headers=headers, timeout=10)
    for oid1 in _cid_pattern.findall(res0.text):
        get_danmu(oid1)  # fetch and store the danmaku for this cid