Compare commits
No commits in common. 'main' and 'master' have entirely different histories.
@ -0,0 +1,119 @@
|
|||||||
|
import time
|
||||||
|
import requests
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
import re
|
||||||
|
import wordcloud
|
||||||
|
import pandas as pd
|
||||||
|
from collections import Counter
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
#获取视频bv号
|
||||||
|
# Collect BV ids of target videos from Bilibili search-result pages.
def get_bvid(headers):
    """Scrape the first 10 search-result pages for the hard-coded
    "2024 Paris Olympics" keyword and return the unique video BV ids.

    Args:
        headers: HTTP headers (Cookie / User-Agent) required by Bilibili.

    Returns:
        list[str]: de-duplicated BV ids found in the result HTML.
    """
    url = 'https://search.bilibili.com/video?vt=83711075&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3'
    # A set keeps ids unique as we go, replacing the original's two
    # separate list(set(...)) dedup passes.
    bv_ids = set()
    for page in range(1, 11):
        param = {
            'spm_id_from': '333.1007',
            'search_source': '3',
            'page': str(page),
            'o': ''
        }
        # timeout prevents a hung connection from stalling the crawl
        bi_text = requests.get(url=url, headers=headers, params=param,
                               timeout=10).text
        # Extract BV ids from the video anchor hrefs in the rendered HTML.
        bv_ids.update(re.findall(
            '<div.*?<a href="//www.bilibili.com/video/(.*?)/".*?</div></a>',
            bi_text))
        time.sleep(1)  # be polite: throttle to avoid rate limiting
    return list(bv_ids)
|
||||||
|
|
||||||
|
# 获取视频的 cid
|
||||||
|
# Get the cid (danmaku/comment id) of one video.
def get_cid(bv_id, headers):
    """Look up a video's cid via Bilibili's web-interface API.

    Args:
        bv_id: the video's BV id.
        headers: HTTP headers required by Bilibili.

    Returns:
        The cid on success, or None on any request/payload failure —
        the caller (fetch_danmu) treats a falsy cid as "skip this video",
        so failures no longer crash the worker thread with a KeyError.
    """
    url = f"https://api.bilibili.com/x/web-interface/view?bvid={bv_id}"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        # On API errors 'data' may be None / missing 'cid'.
        return data['data']['cid']
    except (requests.RequestException, KeyError, TypeError, ValueError) as exc:
        print(f"{bv_id} cid 请求失败: {exc}")
        return None
|
||||||
|
|
||||||
|
# 获取弹幕 XML
|
||||||
|
# Download the raw danmaku XML for a given cid.
def get_danmu(cid, headers):
    """Return the danmaku XML bytes for *cid*, or None on a non-200 reply."""
    url = f"https://comment.bilibili.com/{cid}.xml"
    response = requests.get(url, headers=headers)
    # Guard clause: anything but 200 means no usable XML.
    if response.status_code != 200:
        print(f"弹幕请求失败,状态码: {response.status_code}")
        return None
    return response.content
|
||||||
|
|
||||||
|
# 解析 XML 并返回弹幕内容
|
||||||
|
# Parse danmaku XML and keep only the comments that mention "AI".
def parse_danmu(xml_content):
    """Extract danmaku texts mentioning "AI" as a standalone token.

    Args:
        xml_content: bytes/str of a Bilibili danmaku XML document whose
            <d> children each carry one comment as element text.

    Returns:
        list[str]: comments where "AI" is surrounded on both sides by a
        character that is neither an ASCII letter nor '?' (re.I makes the
        [^a-z] classes exclude A-Z too), i.e. not part of an English word.
    """
    root = ET.fromstring(xml_content)
    # d.text is None for empty <d></d> elements; the original passed
    # None into pattern.search() and raised TypeError — skip them.
    danmu_list = [d.text for d in root.findall('d') if d.text]
    pattern = re.compile(r'([^a-z?]AI[^a-z?])', re.I)
    return [text for text in danmu_list if pattern.search(text)]
|
||||||
|
# 多线程处理弹幕爬取
|
||||||
|
# Fetch and filter the danmaku of one video (thread-pool worker).
def fetch_danmu(bv_id, headers):
    """Return the AI-related danmaku of one video, or [] on any failure."""
    cid = get_cid(bv_id, headers)
    if not cid:
        return []
    xml_content = get_danmu(cid, headers)
    if not xml_content:
        return []
    return parse_danmu(xml_content)
|
||||||
|
|
||||||
|
|
||||||
|
# 多线程爬取多个视频的弹幕
|
||||||
|
# Crawl the danmaku of many videos concurrently.
def fetch_all_danmus(bv_list, headers):
    """Fetch AI-related danmaku for every BV id in *bv_list*.

    Uses a 5-worker thread pool; a failed video is reported on stdout
    and skipped rather than aborting the whole crawl.
    """
    all_danmu = []

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(fetch_danmu, bv_id, headers): bv_id
                   for bv_id in bv_list}

        for future in as_completed(futures):
            bv_id = futures[future]
            try:
                result = future.result()
            except Exception as exc:
                print(f"{bv_id} 弹幕爬取时发生异常: {exc}")
            else:
                if result:
                    all_danmu.extend(result)

    return all_danmu
|
||||||
|
|
||||||
|
#统计弹幕数量 导出为excel表格 并输出数量前8名的弹幕
|
||||||
|
# Count danmaku frequencies, export to Excel, and print the top 8.
def top_danmu(danmu_list):
    """Tally *danmu_list*, write the full counts to danmu_ai_count.xlsx,
    and print the eight most frequent comments to stdout."""
    counter = Counter(danmu_list)

    # Export the complete tally as a spreadsheet.
    df = pd.DataFrame(list(counter.items()), columns=['Danmu', 'Count'])
    df.to_excel('danmu_ai_count.xlsx', index=False)

    # Show the eight most common comments.
    for danmu, count in counter.most_common(8):
        print(f"{danmu}: {count}")
|
||||||
|
|
||||||
|
#根据弹幕输出词云图
|
||||||
|
# Render the danmaku as a word-cloud image.
def wordcloud_danmu(danmu_list):
    """Concatenate all danmaku and save a word-cloud rendering to 1.png."""
    text = ''.join(danmu_list)
    # WordCloud.generate returns the instance, so the calls chain.
    wordcloud.WordCloud(
        height=300,
        width=500,
        background_color='white',
        font_path='msyh.ttc',  # font with CJK glyph coverage
        scale=15,
    ).generate(text).to_file('1.png')
|
||||||
|
|
||||||
|
#主函数
|
||||||
|
# --- Script entry: crawl, analyse, and visualise the danmaku ---

# Request headers Bilibili expects from a logged-in browser session.
# NOTE(review): this Cookie embeds live credentials (SESSDATA, bili_jct,
# DedeUserID) — they should not be committed to a repository; load them
# from an environment variable or untracked config file instead.
headers = {
    'Cookie': 'buvid_fp=bf8beba45de9821b7ea7e50612d09908; LIVE_BUVID=AUTO4116559027162689; CURRENT_FNVAL=4048; buvid3=973F36A7-32C2-B0E4-8ACE-5E0B758E2FF782902infoc; b_nut=1701966882; _uuid=CC9ED47C-31078-19D2-10595-CD6B1057A2631005192infoc; buvid4=CE632AC2-E7EE-2F09-30E0-9BF3F5C2D40878774-024042908-KwURbcfX8EBz810RKJAgEw%3D%3D; rpdid=|(u))kkYu|)m0J\'u~uRJ~)J~m; header_theme_version=CLOSE; enable_web_push=DISABLE; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NjQ0NTksImlhdCI6MTcyNjUwNTE5OSwicGx0IjotMX0.hMTE7drIcEEwzsI8zYxYXI1o7zzeK2ggwuLaW7EbN2M; bili_ticket_expires=1726764399; bp_t_offset_593983186=978039902421647360; SESSDATA=1a5c072c%2C1742137341%2C9ab0d%2A92CjC2gzBMHSu21uOWgbHRFRswIo1CHBMpBvihPj-9d6qWyU-A5dWmJF_QPAEJPwyIBJ0SVm13Uzk1bENFZGR2bk9nR3ZtaDNmQXJTeE8xVDR5RFZoRkIxS3NmYU9kc01Sb0VKZ0hnMTRjbEhLVHQ3REdqdDVYNm5LLWhYOGp1RE42QjhPS1Ntb3B3IIEC; bili_jct=369030ec2cf370e475f8d79829b8f725; DedeUserID=593983186; DedeUserID__ckMd5=48e87375187f5f74; sid=8vio5fpf; b_lsid=7102AB72D_1920545EF21; bsource=search_bing; bmg_af_switch=1; bmg_src_def_domain=i1.hdslb.com; home_feed_column=4; browser_resolution=659-994',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}
# Step 1: collect BV ids from the search-result pages.
bv_list = get_bvid(headers)
# Step 2: crawl the AI-related danmaku of every video concurrently.
all_danmu = fetch_all_danmus(bv_list,headers)
# Step 3: report and visualise — only when something was found.
if all_danmu:
    top_danmu(all_danmu)
    wordcloud_danmu(all_danmu)
|
||||||
Binary file not shown.
@ -0,0 +1,3 @@
|
|||||||
|
pandas==2.2.2
|
||||||
|
requests==2.32.3
|
||||||
|
wordcloud==1.9.3
|
||||||
Loading…
Reference in new issue