|
|
|
# (leftover diff hunk header from a paste: "@ -0,0 +1,173 @@" — not part of the program)
|
|
|
|
|
import requests
|
|
|
|
|
import re
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import time
|
|
|
|
|
from collections import Counter
|
|
|
|
|
import jieba
|
|
|
|
|
import wordcloud
|
|
|
|
|
import imageio
|
|
|
|
|
|
|
|
|
|
# 全局变量定义
|
|
|
|
|
video_num = 300 # 定义要爬取的视频数量
|
|
|
|
|
stopwords_path = r'C:\Users\ly130\Desktop\Homework1\停用词.txt' # 定义停用词文件的路径
|
|
|
|
|
ai_keyword_list = ['AI', '智能', '深度学习', '自然语言处理', '语音识别', '语音合成', '图像识别', '图像处理'] # 定义与AI相关的关键词列表
|
|
|
|
|
|
|
|
|
|
# 设置请求头和基础URL
|
|
|
|
|
headers = {
|
|
|
|
|
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
|
|
|
"cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
|
|
|
|
|
}
|
|
|
|
|
base_url = "https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3"
|
|
|
|
|
|
|
|
|
|
def get_bv(max_videos):
    """Collect unique video BV ids from Bilibili search result pages.

    Args:
        max_videos (int): upper bound on the number of BV ids to collect.

    Returns:
        list: up to ``max_videos`` unique BV id strings.
    """
    bv_set = set()
    # Each search page carries up to 30 results, so request only as many
    # pages as max_videos can possibly need (the original hard-coded 10
    # pages, silently capping the crawl at 300 videos).
    pages_needed = -(-max_videos // 30)  # ceiling division
    for index in range(1, pages_needed + 1):
        if len(bv_set) >= max_videos:
            break
        searchpage_url = base_url + f"&page={index}&page_size=30"
        try:
            # timeout prevents the crawler from hanging forever on a stalled connection
            response = requests.get(searchpage_url, headers=headers, timeout=10)
            response.raise_for_status()
            # BV ids appear in anchors like href="//www.bilibili.com/video/BVxxxx/"
            bv_set.update(re.findall('href="//www.bilibili.com/video/(.*?)/"', response.text))
        except requests.RequestException as e:
            print(f"请求错误: {e}")
    bv_list = list(bv_set)[:max_videos]
    print("爬取bv号完成,共", len(bv_list), "个bv号")
    return bv_list
|
|
|
|
|
|
|
|
|
|
def get_oid(bv_list):
    """Resolve each BV id to the cid (oid) of its first page.

    Args:
        bv_list (list): video BV id strings.

    Returns:
        list: cid strings, one per video that resolved successfully.
    """
    oids = []
    for bv in bv_list:
        video_url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp"
        try:
            # timeout prevents a stalled request from blocking the whole crawl
            response = requests.get(video_url, headers=headers, timeout=10)
            response.raise_for_status()
            # The pagelist API returns JSON like
            # {"code":0, "data":[{"cid":..., "page":1, ...}, ...]};
            # parse it properly instead of regex-scraping the raw text.
            pages = response.json().get('data') or []
            if pages:
                # str() keeps the same element type the regex version produced
                oids.append(str(pages[0]['cid']))
        except (requests.RequestException, ValueError, KeyError) as e:
            # ValueError covers malformed JSON; KeyError a missing "cid" field.
            print(f"请求错误: {e}")
    return oids
|
|
|
|
|
|
|
|
|
|
def get_danmu(oids):
    """Download the danmu (bullet comments) for each video cid.

    Args:
        oids (list): video cid values.

    Returns:
        list: all danmu text strings, across every video.
    """
    content_new = []
    for idx, cid in enumerate(oids, start=1):
        danmu_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        try:
            # timeout prevents one dead request from hanging the crawl
            response = requests.get(danmu_url, headers=headers, timeout=10)
            response.raise_for_status()
            # The danmu XML's charset is not reliable in the HTTP headers;
            # let requests sniff it from the body before decoding.
            response.encoding = response.apparent_encoding
            # Each danmu is carried in a <d p="...">text</d> element.
            danmu_list = re.findall(r'<d p=".*?">(.*?)</d>', response.text)
            content_new.extend(danmu_list)
            print("爬取视频", idx, "弹幕, 获取到:", len(danmu_list), "条弹幕")
            time.sleep(0.5)  # throttle between requests to avoid rate limiting
        except requests.RequestException as e:
            print(f"请求错误: {e}")
    return content_new
|
|
|
|
|
|
|
|
|
|
def save_danmu(content_new):
    """Clean danmu text and persist it to a UTF-8 text file and an Excel sheet.

    Control characters (C0 range and DEL) are stripped; a danmu containing
    control characters mid-string is split into its printable fragments.

    Args:
        content_new (list): raw danmu strings.
    """
    processed_text = []
    for word in content_new:
        # Keep only printable runs: drop \x00-\x1F and \x7F control characters.
        processed_text.extend(re.findall(r'[^\x00-\x1F\x7F]+', word))
    with open('弹幕.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(processed_text))
    df = pd.DataFrame({'弹幕内容': processed_text})
    df.to_excel('所有弹幕.xlsx', index=False)
    print("弹幕保存完成")
|
|
|
|
|
|
|
|
|
|
def top_eight_words(content):
    """Report the eight most frequent danmu entries and save them to Excel.

    Args:
        content (list): danmu strings (each string is counted as one entry).
    """
    top8 = Counter(content).most_common(8)
    print("前八个词频:", top8)
    frame = pd.DataFrame({
        '词频': [count for _, count in top8],
        '词语': [term for term, _ in top8],
    })
    frame.to_excel('词频统计.xlsx', index=False)
    print("词频统计已保存")
|
|
|
|
|
|
|
|
|
|
def get_ai_danmu(content):
    """Filter out the AI-related danmu and save them to an Excel file.

    A danmu counts as AI-related when it contains any keyword from the
    module-level ``ai_keyword_list``.

    Args:
        content (list): danmu strings to filter.
    """
    ai_danmu = []
    for line in content:
        for keyword in ai_keyword_list:
            if keyword in line:
                ai_danmu.append(line)
                break  # one match is enough; avoid duplicate rows
    frame = pd.DataFrame({'AI弹幕内容': ai_danmu})
    frame.to_excel('AI弹幕.xlsx', index=False)
    print("AI弹幕已保存")
|
|
|
|
|
|
|
|
|
|
def load_stopwords(path):
    """Read a stop-word file (one word per line) into a set.

    Args:
        path (str): path to the UTF-8 stop-word file.

    Returns:
        set: stripped lines from the file, or an empty set when the file
        does not exist.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            words = {entry.strip() for entry in f}
    except FileNotFoundError:
        print("停用词文件未找到")
        return set()
    return words
|
|
|
|
|
|
|
|
|
|
def generate_word_cloud(text, stopwords_path,
                        mask_path=r'C:\Users\ly130\Desktop\Homework1\mask.png',
                        output_path=r'C:\Users\ly130\Desktop\Homework1\word_cloud123.png'):
    """Render a word-cloud PNG from danmu text.

    Args:
        text (str): full danmu text to tokenize.
        stopwords_path (str): path to the stop-word file.
        mask_path (str): image whose shape constrains the cloud
            (default preserves the original hard-coded path).
        output_path (str): where the rendered PNG is written
            (default preserves the original hard-coded path).
    """
    stopwords = load_stopwords(stopwords_path)
    words = jieba.lcut(text)
    # Keep multi-character tokens that are not stop words.
    c = Counter(word for word in words if len(word) > 1 and word not in stopwords)
    word_list = c.most_common(200)

    image = imageio.v2.imread(mask_path)
    # numpy image arrays are (height, width[, channels]): shape[1] is the
    # width and shape[0] the height — the original had these swapped.
    wc = wordcloud.WordCloud(width=image.shape[1], height=image.shape[0],
                             background_color='white',
                             font_path='msyh.ttc', mask=image, scale=3)
    wc.generate_from_frequencies(dict(word_list))
    wc.to_file(output_path)
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Full pipeline: search pages -> BV ids -> cids -> danmu text.
    danmu = get_danmu(get_oid(get_bv(video_num)))

    # Persist and analyse the collected danmu.
    save_danmu(danmu)
    top_eight_words(danmu)
    get_ai_danmu(danmu)

    # Re-read the cleaned danmu written by save_danmu and build the word cloud.
    with open('弹幕.txt', 'r', encoding='utf-8') as f:
        generate_word_cloud(f.read(), stopwords_path)
|
|
|
|
|
|