"""Scrape danmaku (bullet comments) from Bilibili search results.

The script runs in two steps:

1. Walk the first 10 pages of a Bilibili search and store every distinct
   video bvid in ``bvid.txt`` (one per line).
2. For at most ``video_num`` of those videos, look up the cid embedded in
   the video page and append that video's danmaku to ``爬取的总弹幕.txt``.
"""
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
import wordcloud

video_num = 300  # maximum number of videos whose danmaku are downloaded

# Request headers used for the search pages and the video pages.
# NOTE(review): the cookie below embeds live-looking session credentials
# (SESSDATA, bili_jct). Consider loading it from an environment variable or
# an untracked config file instead of committing it to source control.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    "cookie":"buvid3=54DE8606-3021-2FB6-4872-357B0704095E16527infoc; b_nut=1726194016; _uuid=E15C76EA-7673-A719-181C-B310137107ECD121742infoc; enable_web_push=DISABLE; home_feed_column=5; buvid_fp=15814142e80dfa9c068eed7a71851bf5; buvid4=76281DE5-AB09-6D18-BA67-2665764B23E418108-024091302-QuJvEpYDn4lWADFDZJ3oVg%3D%3D; SESSDATA=90f76878%2C1742031396%2C370e0%2A92CjBC5iP1hhGyQ1Xw0rdbO9xgMM2_MTiR33GjObW1Q6tORSBoVnJm05JChaZAeeHOpRgSVmRUZlVSbVFGWmhRWDA2NE9nWUFyNENOT1BDYS1RSkVEOVVka3R4dXh4R1FDTE1KTDJFQndDNVlsVGFla3RLY0NZQ2pET1BqMEw1MkloMmRLZU9XR2xRIIEC; bili_jct=b1ef05e044aee6835cda207b0139fa50; DedeUserID=34740935; DedeUserID__ckMd5=071ab34a61265a21; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3Mzg2MDksImlhdCI6MTcyNjQ3OTM0OSwicGx0IjotMX0.cutEpP5MEoNMCXJT4-E1j4YcTZyj_DjFnqKO6Fcn1n8; bili_ticket_expires=1726738549; rpdid=|(RYkm|Yul)0J'u~kYR~mRuk; bp_t_offset_34740935=978030977479606272; header_theme_version=CLOSE; browser_resolution=1699-941; b_lsid=CD3ABBDF_191FEA3B82D"
}

_BVID_FILE = 'bvid.txt'  # one bvid per line
_DANMU_FILE = '爬取的总弹幕.txt'  # all downloaded danmaku, one per line

# URL of the first search-result page; later pages append "&page=N".
# NOTE(review): the URL-encoded keyword starts with "20224" - possibly a typo
# for "2024". Left unchanged because it alters which videos are returned.
_SEARCH_URL = "https://search.bilibili.com/all?vt=60711865&keyword=20224%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3"

# Patterns are compiled once because they are reused on every response.
_BVID_PATTERN = re.compile('bvid:"(.*?)",title:')
# The page JSON must be matched without extra whitespace between tokens.
_CID_PATTERN = re.compile('"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),')
# Assumes the danmaku endpoint returns XML where each comment is wrapped as
# <d p="...">text</d> - TODO confirm against a live response. The previous
# pattern '(.*?)' had lost these delimiters and only produced empty strings.
_DANMU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')


def get_danmu(cid):
    """Download the danmaku of one video and append them to the danmaku file.

    The extracted comments are printed and then appended, one per line, to
    ``爬取的总弹幕.txt``.

    Args:
        cid: The cid of the video; it is sent as the ``oid`` query parameter.
    """
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    headers1 = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }
    resp = requests.get(url, headers=headers1)
    resp.encoding = "utf-8"  # decode the body as UTF-8 so Chinese text is not garbled
    context = _DANMU_PATTERN.findall(resp.text)
    print(context)
    # Open the output file once per video rather than once per comment.
    with open(_DANMU_FILE, mode='a', encoding='utf-8') as f:
        f.writelines(f'{text}\n' for text in context)


# Step 1: collect the bvid of every video on the first 10 search pages.
# A set tracks what has been written: the old check read back a file opened
# in 'a+' mode, whose position starts at end-of-file, so it always read ''
# and never detected a duplicate.
seen_bvids = set()
with open(_BVID_FILE, mode='w', encoding='utf-8') as bvid_file:
    for page0 in range(1, 11):
        # The first page is requested without a "page" query parameter.
        url = _SEARCH_URL if page0 == 1 else f"{_SEARCH_URL}&page={page0}"
        response = requests.get(url=url, headers=headers)
        for bvid in _BVID_PATTERN.findall(response.text):
            if bvid not in seen_bvids:
                seen_bvids.add(bvid)
                bvid_file.write(f'{bvid}\n')

# Step 2: resolve each bvid to its cid and download the danmaku.
with open(_BVID_FILE, mode='r', encoding='utf-8') as f1:
    bvid_text = f1.read().splitlines()

for count, bvid1 in enumerate(bvid_text, start=1):
    url = f"https://www.bilibili.com/video/{bvid1}/?spm_id_from=333.337.search-card.all.click&vd_source=516714ff716c382225c801afa2c87d8d"
    res0 = requests.get(url, headers=headers)
    for oid1 in _CID_PATTERN.findall(res0.text):
        get_danmu(oid1)
    # Stop once the first `video_num` videos have been processed.
    if count >= video_num:
        break