# Bilibili danmu (bullet-comment) scraper for "2024 Paris Olympics" search results.
# NOTE: removed non-Python web-page residue ("You can not select more than 25 topics...",
# "163 lines / 7.0 KiB") that made this file unparseable.
import requests
import re
import pandas as pd
import time
from collections import Counter
import jieba
import wordcloud
import imageio
# Global configuration
video_num = 300 # maximum number of videos to crawl
stopwords_path = r'C:\Users\ly130\Desktop\Homework1\停用词.txt' # path to the stopword list file
ai_keyword_list = ['AI', '智能'] # keywords used to identify AI-related danmu
# Request headers (User-Agent + session cookie) and the search-results base URL
headers = {
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
"cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
}
base_url = "https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3"
def get_bv(max_videos):
    """Collect video BV ids from Bilibili search-result pages.

    Scans up to 10 result pages (30 items each), stopping early once
    enough distinct ids are gathered.

    Args:
        max_videos (int): Maximum number of BV ids to return.

    Returns:
        list: BV id strings, at most ``max_videos`` of them.
              Order is arbitrary because ids are deduplicated via a set.
    """
    bv_set = set()
    for index in range(1, 11):
        if len(bv_set) >= max_videos:
            break
        searchpage_url = base_url + f"&page={index}&page_size=30"
        try:
            # timeout added: without it an unresponsive server hangs the crawl forever
            response = requests.get(searchpage_url, headers=headers, timeout=10)
            response.raise_for_status()
            # BV ids appear as //www.bilibili.com/video/<BV...>/ links in the page HTML
            bv_set.update(re.findall('href="//www.bilibili.com/video/(.*?)/"', response.text))
        except requests.RequestException as e:
            print(f"请求错误: {e}")
    bv_list = list(bv_set)[:max_videos]
    print("爬取bv号完成", len(bv_list), "个bv号")
    return bv_list
def get_oid(bv_list):
    """Resolve each BV id to its first cid via the pagelist API.

    The cid (called "oid" by the danmu endpoint) identifies a single
    video part; only the first part of each video is taken.

    Args:
        bv_list (list): BV id strings.

    Returns:
        list: cid strings for the videos that resolved successfully
              (failed or unmatched responses are skipped).
    """
    oids = []
    for bv in bv_list:
        video_url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp"
        try:
            # timeout added: without it an unresponsive server hangs the crawl forever
            response = requests.get(video_url, headers=headers, timeout=10)
            response.raise_for_status()
            # grab the first "cid" field from the JSON payload
            oid_match = re.search('"cid":(.*?),"page":', response.text)
            if oid_match:
                oids.append(oid_match.group(1))
        except requests.RequestException as e:
            print(f"请求错误: {e}")
    return oids
def get_danmu(oids):
    """Download the danmu (bullet comments) for each cid.

    Args:
        oids (list): cid strings as returned by ``get_oid``.

    Returns:
        list: All danmu text strings from every video, concatenated.
    """
    content_new = []
    for idx, cid in enumerate(oids, start=1):
        danmu_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        try:
            # timeout added: without it an unresponsive server hangs the crawl forever
            response = requests.get(danmu_url, headers=headers, timeout=10)
            response.raise_for_status()
            # the endpoint returns XML; fix up the charset before regexing the text
            response.encoding = response.apparent_encoding
            danmu_list = re.findall(r'<d p=".*?">(.*?)</d>', response.text)
            content_new.extend(danmu_list)
            print("爬取视频", idx, "弹幕, 获取到:", len(danmu_list), "条弹幕")
            time.sleep(0.5)  # polite delay between requests to avoid rate limiting
        except requests.RequestException as e:
            print(f"请求错误: {e}")
    return content_new
def save_danmu(content_new):
    """Clean danmu strings and persist them to a text file and an Excel file.

    Writes ``弹幕.txt`` (one danmu per line) and ``所有弹幕.xlsx`` in the
    current working directory.

    Args:
        content_new (list): Raw danmu strings.  (Docstring fixed: the
            parameter was previously documented as ``danmu_list``.)
    """
    processed_text = []
    for word in content_new:
        # keep only printable runs: strips ASCII control chars (and splits a
        # danmu on embedded newlines/tabs into separate entries)
        processed_text.extend(re.findall(r'[^\x00-\x1F\x7F]+', word))
    print(processed_text)  # debug echo of the cleaned danmu
    with open('弹幕.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(processed_text))
    df = pd.DataFrame({'弹幕内容': processed_text})
    df.to_excel('所有弹幕.xlsx', index=False)
    print("弹幕保存完成")
def get_ai_danmu(content):
    """Select AI-related danmu, rank them by frequency, and export the top 8.

    A danmu counts as AI-related when it contains any keyword from the
    module-level ``ai_keyword_list``.  Results go to ``AI弹幕词频统计.xlsx``.

    Args:
        content (list): Danmu strings to filter.
    """
    related = [line for line in content
               if any(kw in line for kw in ai_keyword_list)]
    top_entries = Counter(related).most_common(8)
    frequencies = [count for _, count in top_entries]
    phrases = [text for text, _ in top_entries]
    table = pd.DataFrame({'词频': frequencies, '词语': phrases})
    table.to_excel('AI弹幕词频统计.xlsx', index=False)
    print("AI弹幕已保存")
def load_stopwords(path):
    """Read a stopword file (one word per line) into a set.

    Args:
        path (str): Path to the stopword file (UTF-8 encoded).

    Returns:
        set: Stripped stopwords; empty set when the file does not exist.
    """
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            return {entry.strip() for entry in handle}
    except FileNotFoundError:
        print("停用词文件未找到")
        return set()
def generate_word_cloud(text, stopwords_path,
                        mask_path=r'C:\Users\ly130\Desktop\Homework1\mask.png',
                        output_path=r'C:\Users\ly130\Desktop\Homework1\word_cloud123.png'):
    """Build a word cloud image from the danmu text.

    Tokenizes with jieba, drops stopwords and single-character tokens,
    and renders the 200 most common words onto the mask image.

    Args:
        text (str): Danmu text to tokenize.
        stopwords_path (str): Path to the stopword file.
        mask_path (str): Mask image for the cloud shape.  Generalized
            from a hard-coded path; default preserves old behavior.
        output_path (str): Where to write the rendered PNG.  Generalized
            from a hard-coded path; default preserves old behavior.
    """
    stopwords = load_stopwords(stopwords_path)
    words = jieba.lcut(text)
    # single-character tokens are mostly noise in Chinese segmentation
    c = Counter(word for word in words if len(word) > 1 and word not in stopwords)
    word_list = c.most_common(200)
    image = imageio.v2.imread(mask_path)
    wc = wordcloud.WordCloud(width=image.shape[0], height=image.shape[1], background_color='white',
                             font_path='msyh.ttc', mask=image, scale=3)
    wc.generate_from_frequencies(dict(word_list))
    wc.to_file(output_path)
if __name__ == '__main__':
    # Pipeline: search pages -> BV ids -> cids -> danmu -> save/filter/visualize.
    bv_list = get_bv(video_num)
    oid_list = get_oid(bv_list)
    content_new = get_danmu(oid_list)
    save_danmu(content_new)
    get_ai_danmu(content_new)
    # Re-read the cleaned danmu text just written by save_danmu.
    with open('弹幕.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    generate_word_cloud(text, stopwords_path)