From 4a095c9681f26ffca5ac5d567b6377ff6dabd7b2 Mon Sep 17 00:00:00 2001 From: pg78ac9nr <2401677463@qq.com> Date: Wed, 18 Sep 2024 20:44:11 +0800 Subject: [PATCH] ADD file via upload --- 弹幕获取.py | 76 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 弹幕获取.py diff --git a/弹幕获取.py b/弹幕获取.py new file mode 100644 index 0000000..8334b58 --- /dev/null +++ b/弹幕获取.py @@ -0,0 +1,76 @@ +import requests +from bs4 import BeautifulSoup +import pandas as pd +import re +from wordcloud import WordCloud +import matplotlib.pyplot as plt +from openpyxl import Workbook + +def fetch_videos(keyword): + """ + 根据关键词搜索视频,并返回前300个视频的ID列表。 + :param keyword: 搜索关键词 + :return: 视频ID列表 + """ + url = "https://search.bilibili.com/all" + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' + } + params = { + 'keyword': keyword, + 'order': 'totalrank', + 'page': '1' + } + try: + response = requests.get(url, headers=headers, params=params) + response.raise_for_status() # 抛出HTTP错误 + soup = BeautifulSoup(response.text, 'html.parser') + video_ids = [a['href'].split('/')[-1] for a in soup.select('.video-item .title') if a.has_attr('href')] + print(f"Fetched {len(video_ids)} video IDs.") + return video_ids[:300] + except Exception as e: + print(f"Error fetching videos: {e}") + return [] + +def fetch_danmaku(video_id): + """ + 获取指定视频的弹幕数据。 + :param video_id: 视频ID + :return: 弹幕列表 + """ + url = f"https://api.bilibili.com/x/v2/dm/web/seg.so?type=1&oid={video_id}" + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', + 'Cookie': 'buvid3=0C047DB7-FB67-6565-B853-68B19196AEE053166infoc; buvid4=D2E32722-EB31-8B5B-8BC7-420F049CDE3657801-022071821-mG8+jYWtWHQ35A9yqIgZIA%3D%3D; buvid_fp=60e37bdf4fe67cde89d283db25adff46; _uuid=FCEA6C48-BB82-123A-61106-3F5410106BB410B03170infoc; b_nut=100; header_theme_version=CLOSE; enable_web_push=DISABLE; bmg_af_switch=1; bmg_src_def_domain=i0.hdslb.com; bsource=search_bing; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY5MTczNjUsImlhdCI6MTcyNjY1ODEwNSwicGx0IjotMX0.Aa-1_tfEk0rFqyzRFZ-vIsSUSbvUfyR7woQA3IC3h0s; bili_ticket_expires=1726917305; CURRENT_FNVAL=4048; SESSDATA=aa6a6590%2C1742210524%2C7b1c4%2A92CjCxud8rqp6tuF7AYkzmJF0YS7_L4_80iMI3NuY5q-M7BEW3cf0_bVyhIcnZMJapP7YSVnJiQ2NVcTJZZ1ZIMFduRURJXzZXOWtaTTl2WnBFSHkwckM0UzdwY2xHMG9MNVl4c1pUSHlFaFJ4RnQ5WjY3ZHRtcm5qcDhNSVo3eXZORDczc0VlYlF3IIEC; bili_jct=5232d057d308c18c1419d19271a3b85e; DedeUserID=1576579979; DedeUserID__ckMd5=da7d6054e70acbba; b_lsid=674C255B_19204FCE528; sid=g83mkv7a; home_feed_column=4; browser_resolution=435-748', # Cookie值 + 'Referer': f'https://www.bilibili.com/video/{video_id}', + } + try: + response = requests.get(url, headers=headers) + response.raise_for_status() # 抛出HTTP错误 + data = response.json() + danmakus = [re.sub(r'<[^>]+>', '', dm['content']) for dm in data['data']['dm_seg_list']] + print(f"Fetched {len(danmakus)} danmakus for video {video_id}.") + return danmakus + except requests.RequestException as e: + print(f"Error fetching danmakus for video {video_id}: {e}") + return [] + except KeyError as e: + print(f"Invalid response format for video {video_id}: {e}") + return [] + +def analyze_danmakus(danmakus): + """ + 分析弹幕数据,统计与AI技术应用相关的弹幕数量。 + :param danmakus: 弹幕列表 + :return: 弹幕数量的字典 + """ + ai_keywords = ['AI', '人工智能', '机器学习', '深度学习', '算法', '大数据', '智能', '技术'] + keyword_counts = {} + for danmaku in danmakus: + for keyword in ai_keywords: + if keyword in danmaku: + if danmaku in keyword_counts: + keyword_counts[danmaku] += 1 + else: + keyword_counts[danmaku] = 1 + return keyword_counts \ No newline at end of file