"""Collect Bilibili danmaku for a search keyword and summarise AI-related ones.

The script pages through the Bilibili video search API, downloads the danmaku
XML feed of each video found, keeps the comments that mention one of the
AI-related keywords, writes the most frequent ones to an Excel file and shows
a word cloud of their frequencies.
"""
import os
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
from wordcloud import WordCloud

# Seconds to wait for any single HTTP response before giving up; without a
# timeout a stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 10

# Pause between consecutive requests, to avoid hitting the API too frequently.
REQUEST_DELAY = 1

# Number of videos whose danmaku are collected.
MAX_VIDEOS = 300

# NOTE(security): this is a browser session cookie committed to source control.
# It is kept as the default so existing behaviour is unchanged, but it should be
# rotated and supplied through the BILIBILI_COOKIE environment variable instead.
_DEFAULT_COOKIE = 'CURRENT_FNVAL=4048; buvid4=CDB22228-76EA-BC93-F037-78FC6CEC077D36275-023090719-X83v1qigvaVs%2BeTu3%2F5T2g%3D%3D; rpdid=|(u)luk)m|)R0J\'uYmR|)Y)J); enable_web_push=DISABLE; header_theme_version=CLOSE; DedeUserID=567151924; DedeUserID__ckMd5=3d51b3cb3879b2e0; PVID=1; buvid3=909EA327-0037-5349-5CF4-5B2C4EF5300103254infoc; b_nut=1726374703; bsource=search_bing; _uuid=A28E1582-10535-12B8-49B2-D1F1026722D1607876infoc; buvid_fp=1ed033837a881c0f6fee6ce1ae293ed0; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY2MzM5MDQsImlhdCI6MTcyNjM3NDY0NCwicGx0IjotMX0.52ZE1RDG5tSqHh-ZOGgEHDzj6W1UyOtiMkcxw_a2WNY; bili_ticket_expires=1726633844; SESSDATA=701189c6%2C1741926705%2C58e19%2A91CjCANFin8L-nRK6CjxH9_BSgRe6HHUSWybFilZklu8yORRObfCV2cnJswJPECKKy1UcSVkMtcE4ydHItZF9lOW43ZFpyelRVVEUzZUVCdlh6S2ltWWJIaTg0MU1DclRIeG8wbE84cE1pSFBkOXA1alNxTkp3bDJuLWNCN2IzV2JXX2p4SGIxaW9BIIEC; bili_jct=98cbd1d0535939dc4a5c474a44d27ad7; sid=7wfo5xb3; home_feed_column=5; browser_resolution=1432-776; CURRENT_QUALITY=80; bp_t_offset_567151924=977713175669506048; b_lsid=811052DA4_191FB79926A'

# Headers sent with every request made by this script.
headers = {
    'cookie': os.environ.get('BILIBILI_COOKIE', _DEFAULT_COOKIE),
    'Referer': 'https://search.bilibili.com/all?vt=82484714&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
}

# Assumes each comment in the danmaku XML is a `<d p="...">text</d>` element
# (TODO confirm against a live feed). The previous pattern, '">(.*?)', ended in
# a lazy group with nothing after it and therefore only ever captured ''.
_DANMAKU_PATTERN = re.compile(r'<d p="[^"]*">(.*?)</d>')


def get_response(url):
    """Send a GET request for ``url`` with the shared headers.

    Returns the ``requests.Response``. Raises ``requests.RequestException``
    on connection errors or when no response arrives within
    ``REQUEST_TIMEOUT`` seconds.
    """
    return requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)


def get_bv(page_num):
    """Return the ``bvid`` of every video on one page of search results.

    ``page_num`` is the page index sent as the ``page`` query parameter.
    Returns an empty list when the reply carries no ``data.result`` entries,
    so callers can tell that there is nothing further to fetch.
    """
    link = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
    # NOTE(review): 'w_rid' / 'wts' / 'qv_id' are fixed values, presumably
    # captured from a browser session - confirm the API still accepts them.
    data = {
        'category_id': '',
        'search_type': 'video',
        'ad_resource': '5654',
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': page_num,
        'page_size': '42',
        'pubtime_begin_s': '0',
        'pubtime_end_s': '0',
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': '2024巴黎奥运会',
        'qv_id': 'Hz50pRYmKFQYlX2AorY3bJUTNJbLRwnX',
        'source_tag': '3',
        'gaia_vtoken': '',
        'dynamic_offset': '30',
        'web_location': '1430654',
        'w_rid': '1b994979977a17ee8010f012d43fa7b6',
        'wts': '1726501056',
    }
    link_data = requests.get(
        url=link, headers=headers, params=data, timeout=REQUEST_TIMEOUT
    ).json()
    # A reply without 'data' or 'result' (or with them set to null) yields no
    # videos instead of raising KeyError / TypeError.
    results = (link_data.get('data') or {}).get('result') or []
    return [item['bvid'] for item in results]


def get_cid(bvid):
    """Look up the ``cid`` of the video identified by ``bvid``.

    Raises ``KeyError`` / ``TypeError`` when the reply has no ``data.cid``;
    ``main`` treats that as a per-video failure and moves on.
    """
    url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
    data = get_response(url).json()
    return data['data']['cid']


def get_danmaku(cid):
    """Download the danmaku XML for ``cid`` and return it as UTF-8 text."""
    url = f"https://comment.bilibili.com/{cid}.xml"
    response = get_response(url)
    # Force UTF-8 rather than relying on requests' encoding detection.
    response.encoding = 'utf-8'
    return response.text


def parse_danmaku(danmaku_xml):
    """Extract the text of every danmaku element found in ``danmaku_xml``.

    Returns a list of strings in document order; the list is empty when the
    input contains no danmaku elements.
    """
    return _DANMAKU_PATTERN.findall(danmaku_xml)


# Keywords that mark a danmaku as AI-related.
ai_keywords = ["AI", "人工智能", "机器学习", "深度学习", "AI技术", "自动驾驶", "智能", "图像识别", "AI应用", "智造"]


def get_ai_danmaku(danmaku_list_all, ai_keywords):
    """Return the danmaku that contain at least one of ``ai_keywords``.

    Matching is a case-sensitive substring test; input order and duplicates
    are preserved.
    """
    return [
        danmaku
        for danmaku in danmaku_list_all
        if any(keyword in danmaku for keyword in ai_keywords)
    ]


def count_danmaku(danmaku_list):
    """Return a ``Counter`` mapping each danmaku text to its occurrence count."""
    return Counter(danmaku_list)


def get_top_n_danmaku(danmaku_counter, n=8):
    """Return the ``n`` most common ``(text, count)`` pairs, most frequent first."""
    return danmaku_counter.most_common(n)


def write_to_excel(data, filename='danmaku_AI_top8.xlsx'):
    """Write ``(text, count)`` pairs to ``filename`` as a two-column Excel sheet."""
    df = pd.DataFrame(data, columns=['弹幕内容', '出现次数'])
    df.to_excel(filename, index=False)


def get_wordcloud(danmaku_counter):
    """Render and display a word cloud from a text -> frequency mapping.

    Does nothing beyond printing a notice when ``danmaku_counter`` is empty,
    because a word cloud cannot be generated from zero words.
    """
    if not danmaku_counter:
        print("No danmaku to plot; skipping word cloud.")
        return
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=100,  # upper bound on the number of words drawn
        colormap='viridis',
        font_path='msyh.ttc',  # font file with CJK glyphs so Chinese text renders
    ).generate_from_frequencies(danmaku_counter)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


def main():
    """Run the full pipeline: search, download, filter, export and plot."""
    # Collect the bvid of the first MAX_VIDEOS search results.
    videos = []
    page_num = 1
    while len(videos) < MAX_VIDEOS:
        bv_list = get_bv(str(page_num))
        if not bv_list:
            # An empty page means there is nothing more to fetch; without this
            # check the loop would request further pages forever.
            print(f"No results on page {page_num}; stopping with {len(videos)} videos.")
            break
        videos.extend(bv_list)
        page_num += 1
        time.sleep(REQUEST_DELAY)

    # The last page may overshoot the target, so trim to exactly MAX_VIDEOS.
    videos = videos[:MAX_VIDEOS]

    all_danmaku = []
    all_ai_related_danmaku = []

    for bvid in videos:
        try:
            cid = get_cid(bvid)
            danmaku_xml = get_danmaku(cid)
            danmaku_list = parse_danmaku(danmaku_xml)

            all_ai_related_danmaku.extend(get_ai_danmaku(danmaku_list, ai_keywords))
            all_danmaku.extend(danmaku_list)
        except Exception as e:
            # Best effort: one failing video must not abort the whole crawl.
            print(f"Error fetching danmaku for video {bvid}: {e}")
        time.sleep(REQUEST_DELAY)

    # Count how often each AI-related danmaku occurs.
    ai_danmaku_counter = count_danmaku(all_ai_related_danmaku)

    # Report the eight most frequent ones.
    top_8_danmaku = get_top_n_danmaku(ai_danmaku_counter, n=8)
    for dm, _ in top_8_danmaku:
        print(dm)

    write_to_excel(top_8_danmaku)
    get_wordcloud(ai_danmaku_counter)


if __name__ == '__main__':
    main()