import html
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pds
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from PIL import Image
from wordcloud import WordCloud


# Module-level state shared by the crawler functions below.
bvid_list = []     # BV ids of the videos to process
oid_list = []      # OID (cid) values resolved from the BV ids
content_list = []  # text of every collected danmaku
s = set()          # BV ids already seen, used for de-duplication
count = 300        # number of videos to crawl
pages = 15         # number of search result pages to crawl


# Collect the BV ids of the videos returned by the search pages.
def bvid_get(page, count):
    """Scrape search result pages and append up to ``count`` BV ids to ``bvid_list``.

    Search pages ``1 .. page`` for the hard-coded keyword (URL-encoded
    "2024巴黎奥运会") are requested in order. Each BV id that is not yet in the
    module-level set ``s`` is recorded in ``s`` and kept, in discovery order,
    until ``count`` ids have been gathered or the pages are exhausted.

    Differences from the previous implementation:
      * ``bvid_list`` is filled even when fewer than ``count`` unique ids were
        found (it used to stay empty in that case);
      * at most ``count`` ids are appended, in a deterministic order;
      * ``page`` pages are requested (previously ``page - 1``);
      * each request has a timeout so a stalled connection cannot hang the run.

    Args:
        page: Number of search result pages to request.
        count: Maximum number of BV ids to collect.

    A ``requests.RequestException`` is printed and stops the crawl; ids
    collected before the failure are still appended to ``bvid_list``.
    """
    # NOTE(review): the cookie below embeds a logged-in session (SESSDATA,
    # bili_jct). Hard-coded credentials should be moved out of the source
    # (e.g. into an environment variable) and the session rotated.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
    }
    found = []  # new BV ids in discovery order
    try:
        for page_no in range(1, page + 1):
            if len(found) >= count:
                break
            url = f'https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={page_no}'
            response = requests.get(url=url, headers=headers, timeout=10)
            response.encoding = 'utf-8'
            # The ids are pulled out of the page text with a regex.
            for bvid in re.findall(r'bvid:"(.*?)"', response.text):
                if bvid in s:
                    continue  # the same video can appear more than once
                s.add(bvid)
                found.append(bvid)
                if len(found) >= count:
                    break
    except requests.RequestException as e:
        print(e)
    # Keep whatever was gathered, even if the target was not reached.
    bvid_list.extend(found)
    print("爬取",len(bvid_list),"个视频的bvid")


# Resolve the OID (cid) values for every collected video.
def oid_get():
    """Query the pagelist API for each BV id and append its cids to ``oid_list``.

    For every entry of the module-level ``bvid_list`` the pagelist endpoint is
    requested and every ``"cid":<value>,`` occurrence in the response text is
    appended (as a string) to the module-level ``oid_list``.

    A failed request is printed and only that video is skipped; previously the
    first failure aborted the loop and dropped all remaining videos. Each
    request has a timeout so a stalled connection cannot hang the run.
    """
    # NOTE(review): the cookie below embeds a logged-in session (SESSDATA,
    # bili_jct). Hard-coded credentials should be moved out of the source
    # (e.g. into an environment variable) and the session rotated.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
    }
    for bvid in bvid_list:
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # Skip just this video; the rest of the list is still processed.
            print(e)
            continue
        response.encoding = 'utf-8'
        # One response can contain several cid entries; all of them are kept.
        oid_list.extend(re.findall(r'"cid":(.*?),', response.text))


# Download the danmaku of every collected OID.
def danmu_get():
    """Fetch the danmaku list of each OID and append the texts to ``content_list``.

    For every entry of the module-level ``oid_list`` the ``dm/list.so``
    endpoint is requested and the text of each ``<d p="...">...</d>`` element
    is appended to the module-level ``content_list``. The loop pauses 0.5 s
    after every request, and the total number of collected danmaku is printed
    at the end.

    Differences from the previous implementation:
      * a failed request is printed and only that OID is skipped, instead of
        aborting the whole loop;
      * each request has a timeout;
      * character references in the extracted text (``&amp;``, ``&lt;`` ...)
        are decoded with ``html.unescape``.
        NOTE(review): this assumes the payload entity-escapes its text, as
        XML normally does — confirm against a live response.
    """
    # NOTE(review): the cookie below embeds a logged-in session (SESSDATA,
    # bili_jct). Hard-coded credentials should be moved out of the source
    # (e.g. into an environment variable) and the session rotated.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
    }
    danmu_pattern = re.compile(r'<d p=".*?">(.*?)</d>')
    for oid in oid_list:
        url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={oid}'
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # Skip just this OID; the rest of the list is still processed.
            print(e)
        else:
            response.encoding = 'utf-8'
            content_list.extend(
                html.unescape(text)
                for text in danmu_pattern.findall(response.text)
            )
        # Throttle between requests, whether or not this one succeeded.
        time.sleep(0.5)
    print("弹幕数量",len(content_list))


# Dump the collected danmaku to a text file.
def create_txt():
    """Append every entry of ``content_list`` to ``弹幕567.txt``, one per line.

    The file is opened once in append mode (UTF-8) for the whole batch rather
    than being reopened for every single danmaku. Because the mode is append,
    running the script again adds to the existing file instead of replacing
    it. As a consequence of opening the file up front, it is now created even
    when ``content_list`` is empty.
    """
    with open('弹幕567.txt', mode='a', encoding='utf-8') as f:
        f.writelines(content + '\n' for content in content_list)


# Dump the collected danmaku to an Excel workbook.
def create_excel():
    """Write ``content_list`` to ``./弹幕567.xlsx`` as a single column named ``弹幕``.

    The row index is not written to the workbook.
    """
    frame = pds.DataFrame({'弹幕': content_list})
    frame.to_excel("./弹幕567.xlsx", index=False)


# Find the AI-related danmaku and the eight most frequent ones among them.
def AI_danmu_and_top_eight_danmu():
    """Count danmaku containing the keyword ``AI`` and export two workbooks.

    Reads sheet ``Sheet1`` of ``弹幕567.xlsx``, keeps the rows whose ``弹幕``
    column contains the keyword (case-insensitive; missing values count as
    non-matching), and counts how often each distinct text occurs.

    Two files are written, each with the columns ``Text`` and ``Total_Count``:
      * ``AI相关弹幕.xlsx`` — every matching text with its count;
      * ``AI相关弹幕_前8.xlsx`` — the eight texts with the largest counts.

    A confirmation line is printed after each file is written.
    """
    keyword = 'AI'

    def export(counts, path):
        # Turn the counts Series into a two-column table and save it.
        table = counts.reset_index()
        table.columns = ['Text', 'Total_Count']
        table.to_excel(path, index=False)

    danmu = pds.read_excel('弹幕567.xlsx', sheet_name='Sheet1')
    is_match = danmu['弹幕'].str.contains(keyword, na=False, case=False)
    # Occurrences are counted as they are; identical texts are tallied, not merged beforehand.
    content_counts = danmu[is_match]['弹幕'].value_counts()

    # Full list of matching danmaku.
    output_file_path = 'AI相关弹幕.xlsx'
    export(content_counts, output_file_path)
    print(f"包含关键词'{keyword}'的内容及其总数量已写入到'{output_file_path}'")

    # Only the eight most frequent ones.
    output_file_path = 'AI相关弹幕_前8.xlsx'
    export(content_counts.nlargest(8), output_file_path)
    print(f"包含关键词'{keyword}'的内容及其前八个总数量已写入到'{output_file_path}'")


# Word cloud rendering.
def create_wordcloud():
    """Build a word cloud from ``弹幕567.txt``, display it and save it as ``词云567.png``.

    The text is segmented with ``jieba``; tokens of length 1 and tokens listed
    in the exclusion set are dropped, and the remaining tokens are joined with
    spaces before being passed to ``WordCloud``. The image is shown with
    matplotlib (axes hidden) and then written to disk.
    """
    # Common function words that should not appear in the cloud.
    exclude_words = {
        '的', '了', '啊', '在', '是', '我', '你', '他', '她', '它',
        '有', '和', '这', '那', '这个', '不是', '真的', '我们', '你们', '他们',
    }

    with open('弹幕567.txt', encoding='utf-8') as f:
        txt = f.read()

    # Segment the text, then drop one-character tokens and excluded words.
    string = ' '.join(
        token
        for token in jieba.lcut(txt)
        if len(token) > 1 and token not in exclude_words
    )

    # A shape mask was previously prepared here; it is currently not used.
    cloud_options = {
        'width': 1000,
        'height': 700,
        'background_color': 'white',
        'colormap': 'viridis',
        'font_path': 'msyh.ttc',
    }
    wc = WordCloud(**cloud_options)
    wc.generate(string)

    # Show the image without axes, then persist it.
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    wc.to_file('词云567.png')


if __name__ == '__main__':
    # Run the pipeline: collect BV ids, resolve OIDs, download danmaku,
    # export them to txt/xlsx, analyse the AI-related ones, draw the word cloud.
    bvid_get(pages, count)
    for step in (
        oid_get,
        danmu_get,
        create_txt,
        create_excel,
        AI_danmu_and_top_eight_danmu,
        create_wordcloud,
    ):
        step()