You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
danmuCrawler/弹幕-爬虫-数据处理-词云图.py

191 lines
10 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
import pandas as pds
import time
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
from bs4 import BeautifulSoup
from collections import Counter
from openpyxl import Workbook
# Module-level state shared by the crawler stages below.
bvid_list = []     # BV identifiers of the videos to crawl
oid_list = []      # OID (cid) identifiers resolved from the BV numbers
content_list = []  # raw danmu (bullet-comment) strings
s = set()          # de-duplicated BV identifiers collected so far
count = 300        # target number of videos to crawl
pages = 15         # number of search-result pages to scan
#爬取bv号
def bvid_get(page, count):
    """Collect up to ``count`` unique BV ids from Bilibili search pages 1..page.

    Unique ids are accumulated in the module-level set ``s`` and then copied
    into ``bvid_list`` (at most ``count`` of them).  Network errors are
    printed and swallowed, matching the other crawler stages.
    """
    # NOTE(review): this cookie embeds live session credentials
    # (SESSDATA / bili_jct); hard-coding them in source is unsafe --
    # move to an environment variable or config file.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
    }
    try:
        # Original used range(1, page), which skipped the last requested page.
        for page_no in range(1, page + 1):
            url = f'https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={page_no}'
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            # The search page embeds video metadata as inline JS; pull bvids out.
            # (Loop variable renamed -- the original reused `i` for both the
            # page number and the bvid, shadowing the outer index.)
            for bvid in re.findall(r'bvid:"(.*?)"', response.text):
                s.add(bvid)
                # Original tested len(bvid_list) here, which is always 0 until
                # after the crawl, so the early stop never fired.
                if len(s) >= count:
                    break
            if len(s) >= count:
                break
        # Copy whatever was collected -- the original populated bvid_list only
        # when the target count was reached, returning nothing otherwise.
        bvid_list.extend(list(s)[:count])
        print("爬取", len(bvid_list), "个视频的bvid")
    except requests.RequestException as e:
        print(e)
#爬取视频oid号
def oid_get():
    """Resolve every BV id in ``bvid_list`` to its cid(s) and store them in ``oid_list``.

    Queries the player pagelist API per video; a multi-part video yields one
    cid per part.  Request failures are printed and swallowed.
    """
    # Headers are identical for every request, so build them once up front.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
    }
    try:
        for bvid in bvid_list:
            url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            # Each "cid" field in the JSON payload is one danmu pool id.
            oid_list.extend(re.findall(r'"cid":(.*?),', response.text))
    except requests.RequestException as e:
        print(e)
#爬取弹幕
def danmu_get():
    """Download the danmu XML for every oid in ``oid_list`` into ``content_list``.

    Each <d ...>text</d> element of the XML feed is one bullet comment.
    Sleeps 0.5 s between requests to stay polite to the API.
    """
    # Headers are identical for every request, so build them once up front.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
    }
    try:
        for oid in oid_list:
            url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={oid}'
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            content_list.extend(re.findall(r'<d p=".*?">(.*?)</d>', response.text))
            time.sleep(0.5)  # throttle between danmu-pool downloads
        print("弹幕数量", len(content_list))
    except requests.RequestException as e:
        print(e)
#生成弹幕txt
def create_txt():
    """Append every collected danmu to 弹幕567.txt, one comment per line.

    The file is opened in append mode, so repeated runs accumulate lines.
    """
    # Open the file once; the original re-opened (and closed) it for every
    # single danmu, i.e. len(content_list) open/close cycles.
    with open('弹幕567.txt', mode='a', encoding='utf-8') as f:
        for content in content_list:
            f.write(content)
            f.write('\n')
#生成弹幕excel
def create_excel():
    """Write the collected danmu to ./弹幕567.xlsx as a single-column sheet."""
    frame = pds.DataFrame({'弹幕': content_list})
    frame.to_excel("./弹幕567.xlsx", index=False)
#寻找AI相关弹幕以及前8弹幕
def _write_counts_excel(counts, output_file_path):
    """Persist a pandas value-count Series as a Text/Total_Count Excel sheet."""
    counts_df = counts.reset_index()
    counts_df.columns = ['Text', 'Total_Count']
    counts_df.to_excel(output_file_path, index=False)

def AI_danmu_and_top_eight_danmu():
    """Extract AI-related danmu from 弹幕567.xlsx and report frequency tables.

    Writes two files: AI相关弹幕.xlsx (all matching comments with their
    occurrence counts) and AI相关弹幕_前8.xlsx (the eight most frequent).
    Assumes 弹幕567.xlsx (produced by create_excel) exists with a '弹幕' column.
    """
    file_path = '弹幕567.xlsx'
    sheet_name = 'Sheet1'
    keyword = 'AI'
    df = pds.read_excel(file_path, sheet_name=sheet_name)
    # Case-insensitive substring match; NaN cells count as non-matching.
    filtered_df = df[df['弹幕'].str.contains(keyword, na=False, case=False)]
    # Occurrence count per distinct comment text (no de-duplication upstream).
    content_counts = filtered_df['弹幕'].value_counts()
    # Full table of matches.  (The reset_index/rename/to_excel sequence was
    # duplicated in the original; factored into _write_counts_excel.)
    output_file_path = 'AI相关弹幕.xlsx'
    _write_counts_excel(content_counts, output_file_path)
    print(f"包含关键词'{keyword}'的内容及其总数量已写入到'{output_file_path}'")
    # Top-8 most frequent matches.
    output_file_path = 'AI相关弹幕_前8.xlsx'
    _write_counts_excel(content_counts.nlargest(8), output_file_path)
    print(f"包含关键词'{keyword}'的内容及其前八个总数量已写入到'{output_file_path}'")
#词云图
def create_wordcloud():
    """Build a word cloud from 弹幕567.txt, display it, and save 词云567.png."""
    # Stop words excluded from the cloud.  NOTE(review): the original literal
    # was set([...]) padded with many duplicate empty-string entries (likely
    # single-character stop words mangled by an encoding round-trip); those
    # were redundant anyway because of the len(word) > 1 filter below, so
    # dropping them does not change the output.
    exclude_words = {'这个', '不是', '真的', '我们', '你们', '他们'}
    #mask = Image.open(r"D:\b站弹幕爬虫\mask.png").convert("L")
    # Read the raw danmu text dumped by create_txt.
    with open('弹幕567.txt', encoding='utf-8') as f:
        txt = f.read()
    # Tokenize with jieba, then drop stop words and single-character tokens.
    words = jieba.lcut(txt)
    filtered_words = [word for word in words if word not in exclude_words and len(word) > 1]
    string = ' '.join(filtered_words)
    # Configure the cloud; a CJK-capable font is required to render Chinese.
    wc = WordCloud(
        width=1000,
        height=700,
        background_color='white',
        colormap='viridis',
        #mask=mask,
        font_path='msyh.ttc'
    )
    wc.generate(string)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')  # hide the axes around the image
    plt.show()
    # Also persist the rendered cloud to disk.
    wc.to_file('词云567.png')
if __name__ == '__main__':
    # Pipeline: search for BV ids -> resolve cids -> download danmu ->
    # dump to txt/xlsx -> AI-keyword analysis -> word cloud.
    bvid_get(pages,count)
    oid_get()
    danmu_get()
    create_txt()
    create_excel()
    AI_danmu_and_top_eight_danmu()
    create_wordcloud()