"""Scrape Bilibili danmaku (bullet comments) for one search keyword.

Originally committed as ``爬虫弹幕.py``.  Run as a script, it performs two
steps:

1. Request the first ``SEARCH_PAGES`` pages of Bilibili search results and
   store every distinct video bvid in ``bvid.txt`` (one per line).
2. For the first ``video_num`` bvids, request the video page, extract its
   cid value(s) and append the danmaku of each cid to ``爬取的总弹幕.txt``.
"""
import os
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
import wordcloud

# Maximum number of videos whose danmaku are downloaded.
video_num = 300

BVID_FILE = 'bvid.txt'            # collected bvids, one per line
DANMU_FILE = '爬取的总弹幕.txt'      # all downloaded danmaku, one per line
SEARCH_PAGES = 10                 # search-result pages 1..10 are scanned
REQUEST_TIMEOUT = 10              # seconds; without it a stalled request hangs forever

# NOTE(review): the keyword decodes to "20224" followed by the Chinese for
# "Paris Olympics"; the doubled "2" looks like a typo for "2024" - confirm
# before changing, since it alters which videos are returned.
SEARCH_URL = "https://search.bilibili.com/all?vt=60711865&keyword=20224%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3"

# Matches the bvid of every result embedded in the search page source.
BVID_PATTERN = re.compile(r'bvid:"(.*?)",title:')
# Matches the cid inside the video page's inline "embedPlayer" JSON.  The
# pattern must not contain any extra whitespace or it will not match.
CID_PATTERN = re.compile('"embedPlayer":{"p":.*?,"aid":.*?,"bvid":".*?","cid":(.*?),')
# Assumes the danmaku endpoint answers with XML entries of the form
# <d p="attributes">text</d>; the capture group is the comment text.
DANMU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')

# SECURITY: the fallback below is a logged-in session cookie (SESSDATA /
# bili_jct) committed to source control.  Treat it as leaked: revoke it and
# supply a fresh one through the BILIBILI_COOKIE environment variable.
_DEFAULT_COOKIE = "buvid3=54DE8606-3021-2FB6-4872-357B0704095E16527infoc; b_nut=1726194016; _uuid=E15C76EA-7673-A719-181C-B310137107ECD121742infoc; enable_web_push=DISABLE; home_feed_column=5; buvid_fp=15814142e80dfa9c068eed7a71851bf5; buvid4=76281DE5-AB09-6D18-BA67-2665764B23E418108-024091302-QuJvEpYDn4lWADFDZJ3oVg%3D%3D; SESSDATA=90f76878%2C1742031396%2C370e0%2A92CjBC5iP1hhGyQ1Xw0rdbO9xgMM2_MTiR33GjObW1Q6tORSBoVnJm05JChaZAeeHOpRgSVmRUZlVSbVFGWmhRWDA2NE9nWUFyNENOT1BDYS1RSkVEOVVka3R4dXh4R1FDTE1KTDJFQndDNVlsVGFla3RLY0NZQ2pET1BqMEw1MkloMmRLZU9XR2xRIIEC; bili_jct=b1ef05e044aee6835cda207b0139fa50; DedeUserID=34740935; DedeUserID__ckMd5=071ab34a61265a21; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3Mzg2MDksImlhdCI6MTcyNjQ3OTM0OSwicGx0IjotMX0.cutEpP5MEoNMCXJT4-E1j4YcTZyj_DjFnqKO6Fcn1n8; bili_ticket_expires=1726738549; rpdid=|(RYkm|Yul)0J'u~kYR~mRuk; bp_t_offset_34740935=978030977479606272; header_theme_version=CLOSE; browser_resolution=1699-941; b_lsid=CD3ABBDF_191FEA3B82D"

# Request headers used for the search pages and the video pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    "cookie": os.environ.get("BILIBILI_COOKIE", _DEFAULT_COOKIE),
}


def _search_page_url(page):
    """Return the search-results URL for the 1-based *page* number."""
    # The first page is requested without a page parameter; later pages
    # append "&page=N" to the same URL.
    if page == 1:
        return SEARCH_URL
    return f"{SEARCH_URL}&page={page}"


def _collect_bvids(path=BVID_FILE, pages=SEARCH_PAGES):
    """Scan the search result pages and write each distinct bvid to *path*.

    The file is truncated first, then bvids are written one per line in the
    order they are first seen.

    Args:
        path: Destination text file.
        pages: Number of result pages to request, starting at page 1.

    Returns:
        The list of distinct bvids in first-seen order.

    Raises:
        requests.RequestException: If a search request fails or times out.
    """
    # A set gives exact-match de-duplication.  Re-reading the file opened in
    # 'a+' mode cannot work: the stream starts at end-of-file, so read()
    # always returns an empty string and every duplicate would be written.
    seen = set()
    ordered = []
    with open(path, mode='w', encoding='utf-8') as out:
        for page in range(1, pages + 1):
            response = requests.get(
                url=_search_page_url(page),
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            for bvid in BVID_PATTERN.findall(response.text):
                if bvid in seen:
                    continue
                seen.add(bvid)
                ordered.append(bvid)
                out.write(f"{bvid}\n")
    return ordered


def get_danmu(cid):
    """Download the danmaku for *cid* and append them to ``DANMU_FILE``.

    The extracted comments are also printed as a list.

    Args:
        cid: The cid (sent as the ``oid`` query parameter) of one video part.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    headers1 = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }
    resp = requests.get(url, headers=headers1, timeout=REQUEST_TIMEOUT)
    # Force UTF-8 so the Chinese comment text is decoded correctly.
    resp.encoding = "utf-8"
    context = DANMU_PATTERN.findall(resp.text)
    print(context)
    if context:
        # Open the output once per video instead of once per comment.
        with open(DANMU_FILE, mode='a', encoding='utf-8') as f:
            f.writelines(f"{text}\n" for text in context)


def _load_bvids(path=BVID_FILE):
    """Return the bvids stored in *path*, one per line."""
    with open(path, mode='r', encoding='utf-8') as f:
        return f.read().splitlines()


def _download_danmu_for_videos(bvids, limit):
    """Fetch danmaku for at most *limit* videos from *bvids*.

    Each video page is requested with the shared ``headers``; every cid found
    in the page is passed to :func:`get_danmu`.
    """
    for bvid in bvids[:max(limit, 0)]:
        url = f"https://www.bilibili.com/video/{bvid}/?spm_id_from=333.337.search-card.all.click&vd_source=516714ff716c382225c801afa2c87d8d"
        page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        for cid in CID_PATTERN.findall(page.text):
            get_danmu(cid)


def main():
    """Run the full pipeline: collect bvids, then download their danmaku."""
    _collect_bvids()
    _download_danmu_for_videos(_load_bvids(), video_num)


if __name__ == "__main__":
    main()