Compare commits
No commits in common. 'master' and 'main' have entirely different histories.
@ -1,119 +0,0 @@
|
||||
import time
|
||||
import requests
|
||||
import xml.etree.ElementTree as ET
|
||||
import re
|
||||
import wordcloud
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
#获取视频bv号
|
||||
def get_bvid(headers, timeout=10):
    """Collect the BV ids found on the first 10 pages of a fixed search.

    The search URL (including its URL-encoded keyword) is hard-coded; only
    the ``page`` query parameter varies between requests.

    Args:
        headers: HTTP headers sent with every search request.
        timeout: Seconds to wait for each response. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        A list of unique BV id strings, in no particular order.
    """
    url = 'https://search.bilibili.com/video?vt=83711075&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3'
    # Compiled once: the same pattern is applied to every result page.
    bv_pattern = re.compile('<div.*?<a href="//www.bilibili.com/video/(.*?)/".*?</div></a>')
    # A set removes ids repeated within a page or across pages in one step.
    bv_ids = set()
    for page in range(1, 11):
        params = {
            'spm_id_from': '333.1007',
            'search_source': '3',
            'page': str(page),
            'o': '',
        }
        html = requests.get(
            url=url, headers=headers, params=params, timeout=timeout
        ).text
        bv_ids.update(bv_pattern.findall(html))
        # One-second pause between pages (presumably to throttle requests).
        time.sleep(1)
    return list(bv_ids)
|
||||
|
||||
# 获取视频的 cid
|
||||
def get_cid(bv_id, headers, timeout=10):
    """Look up the cid of a video through the web-interface ``view`` API.

    Args:
        bv_id: BV id of the video.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the response before ``requests`` gives
            up instead of blocking indefinitely.

    Returns:
        The ``cid`` found under the ``data`` object of the JSON response, or
        ``None`` when the response carries no usable ``data`` object.
    """
    url = f"https://api.bilibili.com/x/web-interface/view?bvid={bv_id}"
    response = requests.get(url, headers=headers, timeout=timeout)
    body = response.json()
    video_info = body.get('data')
    if not video_info:
        # Report the failure and return a falsy value, as get_danmu does,
        # rather than failing with an opaque KeyError/TypeError.
        print(f"{bv_id} 获取cid失败: {body}")
        return None
    return video_info.get('cid')
|
||||
|
||||
# 获取弹幕 XML
|
||||
def get_danmu(cid, headers, timeout=10):
    """Download the danmu XML document for a cid.

    Args:
        cid: Identifier interpolated into the ``{cid}.xml`` comment URL.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the response; prevents a stalled
            connection from blocking the calling thread forever.

    Returns:
        The raw response body as bytes when the status code is 200,
        otherwise ``None`` (after printing the status code).
    """
    url = f"https://comment.bilibili.com/{cid}.xml"
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"弹幕请求失败,状态码: {response.status_code}")
        return None
    return response.content
|
||||
|
||||
# 解析 XML 并返回弹幕内容
|
||||
def parse_danmu(xml_content):
    """Return the danmu in an XML document that mention "AI".

    Args:
        xml_content: XML text or bytes whose root element has one ``<d>``
            child per danmu.

    Returns:
        The text of every ``<d>`` element containing "AI" (case-insensitive)
        where the match is not directly next to an ASCII letter or a ``?``.
        Elements without text are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not
            well-formed XML.
    """
    root = ET.fromstring(xml_content)
    # Lookarounds check the neighbouring characters without consuming them,
    # so "AI" at the very start or end of a danmu is matched as well. The
    # excluded neighbours (letters and '?') are the same as before.
    pattern = re.compile(r'(?<![a-z?])AI(?![a-z?])', re.I)
    return [
        node.text
        for node in root.findall('d')
        # An empty <d/> has text None, which pattern.search cannot accept.
        if node.text and pattern.search(node.text)
    ]
|
||||
# 多线程处理弹幕爬取
|
||||
def fetch_danmu(bv_id, headers):
    """Fetch and filter the danmu of a single video.

    Chains ``get_cid`` -> ``get_danmu`` -> ``parse_danmu`` and stops at the
    first step that yields a falsy result.

    Args:
        bv_id: BV id of the video.
        headers: HTTP headers forwarded to both requests.

    Returns:
        The list produced by ``parse_danmu``, or an empty list when the cid
        or the XML document could not be obtained.
    """
    video_cid = get_cid(bv_id, headers)
    if not video_cid:
        return []

    raw_xml = get_danmu(video_cid, headers)
    if not raw_xml:
        return []

    return parse_danmu(raw_xml)
|
||||
|
||||
|
||||
# 多线程爬取多个视频的弹幕
|
||||
def fetch_all_danmus(bv_list, headers):
    """Fetch the danmu of many videos using a pool of five worker threads.

    Args:
        bv_list: Iterable of BV ids; one ``fetch_danmu`` task is submitted
            per id.
        headers: HTTP headers forwarded to every task.

    Returns:
        A single flat list with the danmu of every task that succeeded, in
        task-completion order. A task that raises is reported with ``print``
        and contributes nothing.
    """
    collected = []

    with ThreadPoolExecutor(max_workers=5) as pool:
        # Map each future back to its BV id so failures can be attributed.
        pending = {}
        for video_id in bv_list:
            pending[pool.submit(fetch_danmu, video_id, headers)] = video_id

        for finished in as_completed(pending):
            bv_id = pending[finished]
            try:
                # A falsy result (None or an empty list) adds nothing.
                collected.extend(finished.result() or [])
            except Exception as exc:
                print(f"{bv_id} 弹幕爬取时发生异常: {exc}")

    return collected
|
||||
|
||||
#统计弹幕数量 导出为excel表格 并输出数量前8名的弹幕
|
||||
def top_danmu(danmu_list):
    """Count identical danmu, export the counts, and print the top eight.

    Writes every distinct danmu with its count to ``danmu_ai_count.xlsx``
    (columns ``Danmu`` and ``Count``, no index column), then prints the
    eight most frequent danmu, one ``text: count`` line each.

    Args:
        danmu_list: Iterable of danmu strings.
    """
    frequencies = Counter(danmu_list)

    # Export the full frequency table to Excel.
    rows = list(frequencies.items())
    table = pd.DataFrame(rows, columns=['Danmu', 'Count'])
    table.to_excel('danmu_ai_count.xlsx', index=False)

    # Print the eight most common entries.
    for danmu, count in frequencies.most_common(8):
        print(f"{danmu}: {count}")
|
||||
|
||||
#根据弹幕输出词云图
|
||||
def wordcloud_danmu(danmu_list, font_path='msyh.ttc', output_path='1.png'):
    """Render a word-cloud image from the given danmu.

    Args:
        danmu_list: Danmu strings to visualise.
        font_path: Font file handed to ``WordCloud``. Defaults to the
            previously hard-coded ``'msyh.ttc'``.
            NOTE(review): this looks like a platform-specific font file;
            pass another path on systems that do not provide it.
        output_path: File the image is written to. Defaults to the
            previously hard-coded ``'1.png'``.
    """
    # Join with a space: with an empty separator the last word of one danmu
    # fuses with the first word of the next, so the cloud would count
    # tokens that never appeared in any single danmu.
    danmu_string = ' '.join(danmu_list)
    wc = wordcloud.WordCloud(
        height=300,
        width=500,
        background_color='white',
        font_path=font_path,
        scale=15,
    )
    wc.generate(danmu_string)
    wc.to_file(output_path)
|
||||
|
||||
#主函数
|
||||
# Request headers shared by every HTTP call this script makes.
# NOTE(review): the Cookie value embeds what look like account session
# credentials (e.g. SESSDATA, bili_jct). Confirm they have been revoked and
# consider loading the cookie from the environment instead of committing it.
headers = {
    'Cookie': 'buvid_fp=bf8beba45de9821b7ea7e50612d09908; LIVE_BUVID=AUTO4116559027162689; CURRENT_FNVAL=4048; buvid3=973F36A7-32C2-B0E4-8ACE-5E0B758E2FF782902infoc; b_nut=1701966882; _uuid=CC9ED47C-31078-19D2-10595-CD6B1057A2631005192infoc; buvid4=CE632AC2-E7EE-2F09-30E0-9BF3F5C2D40878774-024042908-KwURbcfX8EBz810RKJAgEw%3D%3D; rpdid=|(u))kkYu|)m0J\'u~uRJ~)J~m; header_theme_version=CLOSE; enable_web_push=DISABLE; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NjQ0NTksImlhdCI6MTcyNjUwNTE5OSwicGx0IjotMX0.hMTE7drIcEEwzsI8zYxYXI1o7zzeK2ggwuLaW7EbN2M; bili_ticket_expires=1726764399; bp_t_offset_593983186=978039902421647360; SESSDATA=1a5c072c%2C1742137341%2C9ab0d%2A92CjC2gzBMHSu21uOWgbHRFRswIo1CHBMpBvihPj-9d6qWyU-A5dWmJF_QPAEJPwyIBJ0SVm13Uzk1bENFZGR2bk9nR3ZtaDNmQXJTeE8xVDR5RFZoRkIxS3NmYU9kc01Sb0VKZ0hnMTRjbEhLVHQ3REdqdDVYNm5LLWhYOGp1RE42QjhPS1Ntb3B3IIEC; bili_jct=369030ec2cf370e475f8d79829b8f725; DedeUserID=593983186; DedeUserID__ckMd5=48e87375187f5f74; sid=8vio5fpf; b_lsid=7102AB72D_1920545EF21; bsource=search_bing; bmg_af_switch=1; bmg_src_def_domain=i1.hdslb.com; home_feed_column=4; browser_resolution=659-994',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}
# Pipeline: collect BV ids, then fetch the filtered danmu of every video.
# These statements run whenever the module is executed or imported, since
# there is no ``if __name__ == "__main__"`` guard around them.
bv_list = get_bvid(headers)
all_danmu = fetch_all_danmus(bv_list,headers)
# Skip the Excel report and the word cloud when nothing was collected.
if all_danmu:
    top_danmu(all_danmu)
    wordcloud_danmu(all_danmu)
|
||||
Binary file not shown.
@ -1,3 +0,0 @@
|
||||
pandas==2.2.2
|
||||
Requests==2.32.3
|
||||
wordcloud==1.9.3
|
||||
Loading…
Reference in new issue