import os
import re
import time
import xml.etree.ElementTree as ET
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
import wordcloud

# Seconds allowed for any single HTTP request. requests applies no timeout
# unless one is passed, so a stalled connection would otherwise block forever.
REQUEST_TIMEOUT = 10

# Matches a video id as the literal "BV" followed by ten alphanumerics.
# NOTE(review): assumes this is the id shape embedded in the search page
# markup -- confirm against a live response.
_BVID_PATTERN = re.compile(r'BV[0-9A-Za-z]{10}')

# "AI" (any case) that is not directly preceded or followed by an ASCII
# letter. Lookarounds are used so the match also succeeds when "AI" sits at
# the very start or end of the text.
_AI_PATTERN = re.compile(r'(?<![a-z])AI(?![a-z])', re.I)


def get_bvid(headers, keyword='2024巴黎奥运会', pages=10):
    """Collect the distinct video ids found on the search-result pages.

    Args:
        headers: HTTP headers sent with every search request.
        keyword: Search keyword; requests percent-encodes it into the query.
        pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of unique BV id strings, in no particular order.

    Raises:
        requests.RequestException: If a search request fails or times out.
    """
    url = 'https://search.bilibili.com/video'
    found = set()
    for page in range(1, pages + 1):
        params = {
            'vt': '83711075',
            'keyword': keyword,
            'from_source': 'webtop_search',
            'spm_id_from': '333.1007',
            'search_source': '3',
            'page': str(page),
            'o': '',
        }
        html = requests.get(
            url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
        ).text
        found.update(_BVID_PATTERN.findall(html))
        # Pause between pages to keep the request rate low.
        time.sleep(1)
    return list(found)


def get_cid(bv_id, headers):
    """Look up the cid of a video through the web-interface view API.

    Args:
        bv_id: BV id of the video.
        headers: HTTP headers sent with the request.

    Returns:
        The value of ``data.cid`` from the JSON response, or ``None`` when the
        response carries no such field (a message is printed in that case).

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://api.bilibili.com/x/web-interface/view'
    response = requests.get(
        url, headers=headers, params={'bvid': bv_id}, timeout=REQUEST_TIMEOUT
    )
    payload = response.json()
    # "data" may be absent or null on an error response, so fall back to {}.
    cid = (payload.get('data') or {}).get('cid')
    if cid is None:
        print(f"{bv_id} 获取 cid 失败: {payload.get('message')}")
    return cid


def get_danmu(cid, headers):
    """Download the danmaku XML document for a cid.

    Args:
        cid: Identifier returned by ``get_cid``.
        headers: HTTP headers sent with the request.

    Returns:
        The raw response body as bytes on HTTP 200, otherwise ``None`` (the
        status code is printed).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://comment.bilibili.com/{cid}.xml"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.content
    print(f"弹幕请求失败,状态码: {response.status_code}")
    return None


def parse_danmu(xml_content):
    """Extract the danmaku texts that mention "AI" as a standalone token.

    Args:
        xml_content: XML document (str or bytes) whose root has ``<d>``
            children holding one danmaku text each.

    Returns:
        The texts of the ``<d>`` elements that contain "AI" (case-insensitive)
        not adjacent to another ASCII letter. Elements without text are
        skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not
            well-formed XML.
    """
    root = ET.fromstring(xml_content)
    return [
        d.text
        for d in root.findall('d')
        # d.text is None for an empty element; skip it before searching.
        if d.text and _AI_PATTERN.search(d.text)
    ]


def fetch_danmu(bv_id, headers):
    """Fetch and filter the danmaku of a single video.

    Args:
        bv_id: BV id of the video.
        headers: HTTP headers sent with every request.

    Returns:
        The list produced by ``parse_danmu``, or an empty list when the cid
        or the XML document could not be obtained.
    """
    cid = get_cid(bv_id, headers)
    if not cid:
        return []
    xml_content = get_danmu(cid, headers)
    if not xml_content:
        return []
    return parse_danmu(xml_content)


def fetch_all_danmus(bv_list, headers, max_workers=5):
    """Fetch the filtered danmaku of many videos with a thread pool.

    Args:
        bv_list: Iterable of BV ids.
        headers: HTTP headers sent with every request.
        max_workers: Size of the thread pool.

    Returns:
        One flat list with the danmaku of every video that was fetched
        successfully. A video whose fetch raises is reported on stdout and
        skipped.
    """
    all_danmu = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_bv_id = {
            executor.submit(fetch_danmu, bv_id, headers): bv_id
            for bv_id in bv_list
        }
        for future in as_completed(future_to_bv_id):
            bv_id = future_to_bv_id[future]
            try:
                danmus = future.result()
            except Exception as exc:
                # Boundary handler: one failing video must not abort the rest.
                print(f"{bv_id} 弹幕爬取时发生异常: {exc}")
            else:
                if danmus:
                    all_danmu.extend(danmus)
    return all_danmu


def top_danmu(danmu_list, top_n=8, output_path='danmu_ai_count.xlsx'):
    """Count identical danmaku, export the counts to Excel and print the top.

    Args:
        danmu_list: Danmaku texts to count.
        top_n: How many of the most frequent texts to print.
        output_path: Path of the Excel file to write.
    """
    # most_common() with no argument returns every entry, most frequent
    # first, so the spreadsheet and the printed ranking share one ordering.
    ranked = Counter(danmu_list).most_common()
    df = pd.DataFrame(ranked, columns=['Danmu', 'Count'])
    df.to_excel(output_path, index=False)
    for danmu, count in ranked[:top_n]:
        print(f"{danmu}: {count}")


def wordcloud_danmu(danmu_list, output_path='1.png', font_path='msyh.ttc'):
    """Render a word-cloud image from the danmaku texts.

    Args:
        danmu_list: Danmaku texts to visualise.
        output_path: Path of the image file to write.
        font_path: Font file handed to ``WordCloud``.
    """
    # Join with a space so the end of one danmaku and the start of the next
    # are not fused into a single token.
    danmu_string = ' '.join(danmu_list)
    wc = wordcloud.WordCloud(
        height=300,
        width=500,
        background_color='white',
        font_path=font_path,
        scale=15,
    )
    wc.generate(danmu_string)
    wc.to_file(output_path)


# NOTE(review): the fallback cookie below embeds what look like personal
# session credentials (SESSDATA, bili_jct). They should be rotated and removed
# from source control; set the BILIBILI_COOKIE environment variable to supply
# a cookie without editing this file.
_DEFAULT_COOKIE = 'buvid_fp=bf8beba45de9821b7ea7e50612d09908; LIVE_BUVID=AUTO4116559027162689; CURRENT_FNVAL=4048; buvid3=973F36A7-32C2-B0E4-8ACE-5E0B758E2FF782902infoc; b_nut=1701966882; _uuid=CC9ED47C-31078-19D2-10595-CD6B1057A2631005192infoc; buvid4=CE632AC2-E7EE-2F09-30E0-9BF3F5C2D40878774-024042908-KwURbcfX8EBz810RKJAgEw%3D%3D; rpdid=|(u))kkYu|)m0J\'u~uRJ~)J~m; header_theme_version=CLOSE; enable_web_push=DISABLE; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NjQ0NTksImlhdCI6MTcyNjUwNTE5OSwicGx0IjotMX0.hMTE7drIcEEwzsI8zYxYXI1o7zzeK2ggwuLaW7EbN2M; bili_ticket_expires=1726764399; bp_t_offset_593983186=978039902421647360; SESSDATA=1a5c072c%2C1742137341%2C9ab0d%2A92CjC2gzBMHSu21uOWgbHRFRswIo1CHBMpBvihPj-9d6qWyU-A5dWmJF_QPAEJPwyIBJ0SVm13Uzk1bENFZGR2bk9nR3ZtaDNmQXJTeE8xVDR5RFZoRkIxS3NmYU9kc01Sb0VKZ0hnMTRjbEhLVHQ3REdqdDVYNm5LLWhYOGp1RE42QjhPS1Ntb3B3IIEC; bili_jct=369030ec2cf370e475f8d79829b8f725; DedeUserID=593983186; DedeUserID__ckMd5=48e87375187f5f74; sid=8vio5fpf; b_lsid=7102AB72D_1920545EF21; bsource=search_bing; bmg_af_switch=1; bmg_src_def_domain=i1.hdslb.com; home_feed_column=4; browser_resolution=659-994'

# Request headers shared by every HTTP call in this script.
headers = {
    'Cookie': os.environ.get('BILIBILI_COOKIE', _DEFAULT_COOKIE),
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}
# Script entry: search for videos, collect their filtered danmaku, then
# produce the count spreadsheet and the word-cloud image.
bv_list = get_bvid(headers)
all_danmu = fetch_all_danmus(bv_list, headers)
if all_danmu:
    top_danmu(all_danmu)
    wordcloud_danmu(all_danmu)
else:
    # Say so explicitly instead of exiting silently with no output files.
    print("未获取到任何弹幕,未生成统计结果")