You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
56 lines
3.7 KiB
56 lines
3.7 KiB
import html
import os
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
import wordcloud

# Upper bound on how many videos from bvid.txt have their danmaku collected.
video_num = 300

# Request headers sent with the search-page and video-page requests.
# The cookie can be supplied through the BILIBILI_COOKIE environment variable;
# when that variable is not set the original literal below is used, so existing
# runs behave exactly as before.
# NOTE(review): the fallback literal embeds a logged-in session (SESSDATA,
# bili_jct). It is kept only for backward compatibility -- rotate that session
# and remove the literal from source control.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    "cookie": os.environ.get(
        "BILIBILI_COOKIE",
        "buvid3=54DE8606-3021-2FB6-4872-357B0704095E16527infoc; b_nut=1726194016; _uuid=E15C76EA-7673-A719-181C-B310137107ECD121742infoc; enable_web_push=DISABLE; home_feed_column=5; buvid_fp=15814142e80dfa9c068eed7a71851bf5; buvid4=76281DE5-AB09-6D18-BA67-2665764B23E418108-024091302-QuJvEpYDn4lWADFDZJ3oVg%3D%3D; SESSDATA=90f76878%2C1742031396%2C370e0%2A92CjBC5iP1hhGyQ1Xw0rdbO9xgMM2_MTiR33GjObW1Q6tORSBoVnJm05JChaZAeeHOpRgSVmRUZlVSbVFGWmhRWDA2NE9nWUFyNENOT1BDYS1RSkVEOVVka3R4dXh4R1FDTE1KTDJFQndDNVlsVGFla3RLY0NZQ2pET1BqMEw1MkloMmRLZU9XR2xRIIEC; bili_jct=b1ef05e044aee6835cda207b0139fa50; DedeUserID=34740935; DedeUserID__ckMd5=071ab34a61265a21; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3Mzg2MDksImlhdCI6MTcyNjQ3OTM0OSwicGx0IjotMX0.cutEpP5MEoNMCXJT4-E1j4YcTZyj_DjFnqKO6Fcn1n8; bili_ticket_expires=1726738549; rpdid=|(RYkm|Yul)0J'u~kYR~mRuk; bp_t_offset_34740935=978030977479606272; header_theme_version=CLOSE; browser_resolution=1699-941; b_lsid=CD3ABBDF_191FEA3B82D",
    ),
}

# Scrape the bvid of every video on search-result pages 1-10 and store them,
# de-duplicated, one per line in bvid.txt.
# NOTE(review): the keyword in the URL decodes to "20224巴黎奥运会"; "20224" looks
# like a typo for "2024" -- confirm before changing it, since that alters the
# result set.
search_url = "https://search.bilibili.com/all?vt=60711865&keyword=20224%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3"

# bvids already written. The previous check re-read the file through an 'a+'
# handle, whose position starts at end-of-file, so read() always returned ''
# and duplicates were never filtered out.
seen_bvids = set()

# Mode 'w' discards any bvid.txt left over from an earlier run; the handle is
# opened once and closed by the with-block instead of being leaked.
with open('bvid.txt', mode='w', encoding='utf-8') as bvid_file:
    for page0 in range(1, 11):
        # The first page is requested without a page parameter.
        url = search_url if page0 == 1 else f"{search_url}&page={page0}"
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=url, headers=headers, timeout=10)
        data_list = re.findall('bvid:"(.*?)",title:', response.text)  # match the bvid values
        for index in data_list:
            if index in seen_bvids:
                continue
            seen_bvids.add(index)
            bvid_file.write(index + '\n')

# Function definitions
def get_danmu(cid):
    """Fetch the danmaku of one video and append them to '爬取的总弹幕.txt'.

    Requests the danmaku list for ``cid``, extracts the text of every
    ``<d p="...">`` element from the response, prints the extracted list and
    appends each entry as its own line to the output file.

    Args:
        cid: Value interpolated into the ``oid`` query parameter of the
            danmaku list URL.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'  # danmaku list for this cid
    headers1 = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }
    # A timeout keeps one stalled connection from hanging the whole run.
    resp = requests.get(url, headers=headers1, timeout=10)
    # Force UTF-8 decoding so the Chinese text is not garbled.
    resp.encoding = "utf-8"
    # The text sits inside markup elements, so characters such as '&' and '<'
    # arrive as entities (&amp;, &lt;); unescape them to get the literal text.
    context = [
        html.unescape(text)
        for text in re.findall('<d p=".*?">(.*?)</d>', resp.text)
    ]
    print(context)
    # Open the output file once per call rather than once per danmaku; skip it
    # entirely when nothing matched so that no empty file is created.
    if context:
        with open('爬取的总弹幕.txt', mode='a', encoding='utf-8') as f:
            f.writelines(f"{text}\n" for text in context)

# Resolve the cid (used as the danmaku "oid") of each collected video and
# download its danmaku.
# Read every bvid up front; the with-block closes the handle, which was
# previously left open for the rest of the run.
with open('bvid.txt', mode='r', encoding='utf-8') as f1:
    bvid_text = f1.read().splitlines()

# Only the first video_num entries are processed.
for bvid1 in bvid_text[:video_num]:
    url = f"https://www.bilibili.com/video/{bvid1}/?spm_id_from=333.337.search-card.all.click&vd_source=516714ff716c382225c801afa2c87d8d"
    # A timeout keeps one stalled connection from hanging the whole run.
    res0 = requests.get(url, headers=headers, timeout=10)  # fetch the video page
    Text0 = res0.text
    # The pattern must stay free of extra spaces, otherwise it does not match
    # the data embedded in the page.
    oid_list = re.findall('"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),', Text0)
    for oid1 in oid_list:
        get_danmu(oid1)  # download and store this video's danmaku