You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
120 lines
4.9 KiB
120 lines
4.9 KiB
import time
|
|
import requests
|
|
import xml.etree.ElementTree as ET
|
|
import re
|
|
import wordcloud
|
|
import pandas as pd
|
|
from collections import Counter
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
#获取视频bv号
|
|
def get_bvid(headers, pages=10, timeout=10):
    """Collect video ids from Bilibili search result pages.

    The search keyword is fixed inside the URL below. Result pages
    1..``pages`` are requested one after another, ids are pulled out of the
    returned HTML with a regular expression, and the function sleeps one
    second after each page.

    Args:
        headers: HTTP headers sent with every search request.
        pages: Number of result pages to request (default 10).
        timeout: Seconds to wait for each response. Without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        A list of the unique id strings that were found, in first-seen order.
    """
    url = 'https://search.bilibili.com/video?vt=83711075&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3'
    # A dict is used as an insertion-ordered set so the result is
    # de-duplicated and deterministic.
    found = {}
    for page in range(1, pages + 1):
        param = {
            'spm_id_from': '333.1007',
            'search_source': '3',
            'page': str(page),
            'o': '',
        }
        bi_text = requests.get(
            url=url, headers=headers, params=param, timeout=timeout
        ).text
        # Pick the id segment out of each video link in the page markup.
        for bv_id in re.findall(
            r'<div.*?<a href="//www.bilibili.com/video/(.*?)/".*?</div></a>',
            bi_text,
        ):
            found.setdefault(bv_id, None)
        # Pause between pages to avoid hammering the search endpoint.
        time.sleep(1)
    return list(found)
|
|
|
|
# 获取视频的 cid
|
|
def get_cid(bv_id, headers, timeout=10):
    """Look up the ``cid`` of a video through the web-interface ``view`` API.

    Args:
        bv_id: Video id inserted into the ``bvid`` query parameter.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the response. Without a timeout a
            stalled connection would block the calling worker forever.

    Returns:
        The ``cid`` value found under ``data`` in the JSON reply, or ``None``
        (after printing a message) when the reply carries no such value.
    """
    url = f"https://api.bilibili.com/x/web-interface/view?bvid={bv_id}"
    response = requests.get(url, headers=headers, timeout=timeout)
    payload = response.json()
    # NOTE(review): error replies presumably omit "data" or set it to null --
    # confirm against the API. Either case is mapped to None here instead of
    # an opaque KeyError/TypeError; callers already test the result for
    # truthiness.
    video_info = payload.get('data') or {}
    cid = video_info.get('cid')
    if cid is None:
        print(f"{bv_id} 获取cid失败: {payload.get('message')}")
    return cid
|
|
|
|
# 获取弹幕 XML
|
|
def get_danmu(cid, headers, timeout=10):
    """Download the danmu (bullet comment) XML document for one ``cid``.

    Args:
        cid: Identifier inserted into the comment XML URL.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the response. Without a timeout a
            stalled connection would block the calling worker forever.

    Returns:
        The raw response body (bytes) when the status code is 200. Otherwise
        a message with the status code is printed and ``None`` is returned.
    """
    url = f"https://comment.bilibili.com/{cid}.xml"
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"弹幕请求失败,状态码: {response.status_code}")
        return None
    # Return bytes rather than text so the XML parser can handle decoding.
    return response.content
|
|
|
|
# 解析 XML 并返回弹幕内容
|
|
def parse_danmu(xml_content):
    """Parse a danmu XML document and keep the comments that mention "AI".

    Args:
        xml_content: XML document (bytes or str) whose root element has
            ``<d>`` children holding one comment each.

    Returns:
        A list with the text of every ``<d>`` element that contains "AI"
        (case-insensitive) not directly adjacent to a letter or ``?``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not
            well-formed XML.
    """
    root = ET.fromstring(xml_content)
    # Lookarounds instead of consuming character classes: "AI" at the very
    # start or end of a comment has no neighbouring character, and the old
    # pattern silently dropped those comments. The excluded set (letters and
    # "?") is kept exactly as it was.
    pattern = re.compile(r'(?<![a-z?])AI(?![a-z?])', re.I)
    return [
        text
        for text in (node.text for node in root.findall('d'))
        # Empty <d/> elements have text None; skip them instead of passing
        # None to pattern.search(), which would raise TypeError.
        if text and pattern.search(text)
    ]
|
|
# 多线程处理弹幕爬取
|
|
def fetch_danmu(bv_id, headers):
    """Fetch and filter the danmu of a single video.

    Resolves the video's cid, downloads the danmu XML for it and returns the
    parsed result. An empty list is returned as soon as either the cid or
    the XML content turns out to be falsy.

    Args:
        bv_id: Video id passed on to ``get_cid``.
        headers: HTTP headers passed on to the request helpers.

    Returns:
        Whatever ``parse_danmu`` returns for the downloaded XML, or ``[]``.
    """
    video_cid = get_cid(bv_id, headers)
    if not video_cid:
        return []

    raw_xml = get_danmu(video_cid, headers)
    if not raw_xml:
        return []

    return parse_danmu(raw_xml)
|
|
|
|
|
|
# 多线程爬取多个视频的弹幕
|
|
def fetch_all_danmus(bv_list, headers, max_workers=5):
    """Fetch the danmu of many videos concurrently with a thread pool.

    One ``fetch_danmu`` task is submitted per id. Results are merged in
    completion order, so the order of the returned list is not tied to the
    order of ``bv_list``.

    Args:
        bv_list: Iterable of video ids.
        headers: HTTP headers passed on to every task.
        max_workers: Size of the thread pool (default 5).

    Returns:
        A flat list with the danmu of every video whose task succeeded.
    """
    all_danmu = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its id so failures can be reported by id.
        future_to_bv_id = {
            executor.submit(fetch_danmu, bv_id, headers): bv_id
            for bv_id in bv_list
        }

        for future in as_completed(future_to_bv_id):
            bv_id = future_to_bv_id[future]
            # Broad catch on purpose: this is the per-video boundary, and one
            # failing video must not abort the whole crawl.
            try:
                danmus = future.result()
                if danmus:
                    all_danmu.extend(danmus)
            except Exception as exc:
                print(f"{bv_id} 弹幕爬取时发生异常: {exc}")

    return all_danmu
|
|
|
|
#统计弹幕数量 导出为excel表格 并输出数量前8名的弹幕
|
|
def top_danmu(danmu_list, top_n=8, output_path='danmu_ai_count.xlsx'):
    """Count identical danmu, export the counts to Excel and print the top ones.

    Args:
        danmu_list: Iterable of danmu strings.
        top_n: How many of the most frequent danmu to print (default 8).
        output_path: Destination of the Excel file with columns ``Danmu``
            and ``Count`` (default ``'danmu_ai_count.xlsx'``).

    Returns:
        None. The results are written to ``output_path`` and to stdout.
    """
    counter = Counter(danmu_list)

    # Export every distinct danmu with its count, not only the top ones.
    df = pd.DataFrame(list(counter.items()), columns=['Danmu', 'Count'])
    df.to_excel(output_path, index=False)

    # Print the most frequent entries, one "text: count" line each.
    for danmu, count in counter.most_common(top_n):
        print(f"{danmu}: {count}")
|
|
|
|
#根据弹幕输出词云图
|
|
def wordcloud_danmu(danmu_list, font_path='msyh.ttc', output_path='1.png'):
    """Render a word-cloud image from the given danmu.

    Args:
        danmu_list: Iterable of danmu strings.
        font_path: Font file handed to ``WordCloud`` (default ``'msyh.ttc'``).
        output_path: Image file to write (default ``'1.png'``).

    Returns:
        None. The image is written to ``output_path``.
    """
    # Join with a space, not an empty string: without a separator the end of
    # one danmu and the start of the next fuse into a single token when the
    # word cloud splits the text into words.
    danmu_string = ' '.join(danmu_list)
    wc = wordcloud.WordCloud(
        height=300,
        width=500,
        background_color='white',
        font_path=font_path,
        scale=15,
    )
    wc.generate(danmu_string)
    wc.to_file(output_path)
|
|
|
|
#主函数
|
|
# Request headers shared by every HTTP call in this script.
# NOTE(review): the Cookie value below embeds account session tokens
# (SESSDATA, bili_jct, ...). Hard-coded credentials should be moved out of
# the source (environment variable / local config) and rotated.
headers = {
    'Cookie': 'buvid_fp=bf8beba45de9821b7ea7e50612d09908; LIVE_BUVID=AUTO4116559027162689; CURRENT_FNVAL=4048; buvid3=973F36A7-32C2-B0E4-8ACE-5E0B758E2FF782902infoc; b_nut=1701966882; _uuid=CC9ED47C-31078-19D2-10595-CD6B1057A2631005192infoc; buvid4=CE632AC2-E7EE-2F09-30E0-9BF3F5C2D40878774-024042908-KwURbcfX8EBz810RKJAgEw%3D%3D; rpdid=|(u))kkYu|)m0J\'u~uRJ~)J~m; header_theme_version=CLOSE; enable_web_push=DISABLE; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NjQ0NTksImlhdCI6MTcyNjUwNTE5OSwicGx0IjotMX0.hMTE7drIcEEwzsI8zYxYXI1o7zzeK2ggwuLaW7EbN2M; bili_ticket_expires=1726764399; bp_t_offset_593983186=978039902421647360; SESSDATA=1a5c072c%2C1742137341%2C9ab0d%2A92CjC2gzBMHSu21uOWgbHRFRswIo1CHBMpBvihPj-9d6qWyU-A5dWmJF_QPAEJPwyIBJ0SVm13Uzk1bENFZGR2bk9nR3ZtaDNmQXJTeE8xVDR5RFZoRkIxS3NmYU9kc01Sb0VKZ0hnMTRjbEhLVHQ3REdqdDVYNm5LLWhYOGp1RE42QjhPS1Ntb3B3IIEC; bili_jct=369030ec2cf370e475f8d79829b8f725; DedeUserID=593983186; DedeUserID__ckMd5=48e87375187f5f74; sid=8vio5fpf; b_lsid=7102AB72D_1920545EF21; bsource=search_bing; bmg_af_switch=1; bmg_src_def_domain=i1.hdslb.com; home_feed_column=4; browser_resolution=659-994',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}

# Pipeline: search for video ids, then crawl and filter their danmu.
bv_list = get_bvid(headers)
all_danmu = fetch_all_danmus(bv_list, headers)

# Produce the report and the image only when at least one danmu was kept.
if all_danmu:
    top_danmu(all_danmu)
    wordcloud_danmu(all_danmu)
|