Delete '弹幕-爬虫-数据处理-词云图.py'

main
p7zwxau5j 2 months ago
parent 41b38ac718
commit 146555172e

@ -1,200 +0,0 @@
import requests
import re
import pandas as pds
import time
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
from bs4 import BeautifulSoup
from collections import Counter
from openpyxl import Workbook
# --- module-level crawl state (shared mutable globals) ---
bvid_list = [] # collected BV ids (Bilibili video identifiers)
oid_list = [] # collected OIDs (the "cid" values used by the danmaku API)
content_list = [] # raw danmaku (bullet-comment) text strings
s = set([]) # de-duplication set for BV ids while crawling
count = 300 # target number of videos to crawl
pages = 15 # number of search-result pages to crawl
#爬取bv号
def bvid_get(page,count):
try:
for i in range(1,page): # 爬取指定页数
url = f'https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={i}'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
"cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
}
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
#print(response.text)
temp=re.findall(r'bvid:"(.*?)"',response.text)
#print(temp)
for i in temp:
s.add(i)
if len(bvid_list)>=count:
break
if (len(s)>=count):
bvid_list.extend(list(s))
break
#print(bvid_list)
print("爬取",len(bvid_list),"个视频的bvid")
except requests.RequestException as e:
print(e)
#爬取视频oid号
# Resolve each crawled BV id to its cid values.
def oid_get():
    """Resolve every BV id in ``bvid_list`` to its cid(s) via the pagelist API.

    Appends each extracted cid (as a string) to the module-level ``oid_list``.
    Network failures are caught and printed.
    """
    cid_pattern = r'"cid":(.*?),'
    try:
        for bvid in bvid_list:
            api_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
            request_headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
            }
            resp = requests.get(url=api_url, headers=request_headers)
            resp.encoding = 'utf-8'
            for cid in re.findall(cid_pattern, resp.text):
                oid_list.append(cid)
    except requests.RequestException as err:
        print(err)
#爬取弹幕
# Download danmaku for each oid.
def danmu_get():
    """Download the danmaku XML for every oid in ``oid_list`` and append each
    comment's text to the module-level ``content_list``.

    Sleeps 0.5 s between requests to avoid hammering the API. Network
    failures are caught and printed.
    """
    danmaku_pattern = r'<d p=".*?">(.*?)</d>'
    try:
        for oid in oid_list:
            xml_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={oid}'
            request_headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
            }
            resp = requests.get(url=xml_url, headers=request_headers)
            resp.encoding = 'utf-8'
            # Each danmaku is a <d p="...">text</d> element in the XML.
            content_list.extend(re.findall(danmaku_pattern, resp.text))
            time.sleep(0.5)  # rate-limit between videos
        print("弹幕数量", len(content_list))
    except requests.RequestException as err:
        print(err)
#生成弹幕txt
def create_txt():
for content in content_list:
with open('弹幕567.txt', mode='a', encoding='utf-8') as f:
f.write(content)
f.write('\n')
#print(content)
#生成弹幕excel
def create_excel():
writeData = {
'弹幕': content_list
}
fwrite = pds.DataFrame(writeData)
fwrite.to_excel("./弹幕567.xlsx", index=False)
#寻找前8弹幕
def top_eight_danmu():
counter = Counter(content_list)
# 获取频率前八的内容
top_eight = counter.most_common(8)
# 创建一个新的Excel工作簿和工作表
wb = Workbook()
ws = wb.active
# 写入标题行
ws.append(["Item", "Count"])
# 遍历并写入频率前八的项
for item, count in top_eight:
ws.append([item, count])
# 保存工作簿到文件
wb.save("top_eight_danmu567.xlsx")
#查找AI相关弹幕
def search_AI_danmu():
file_path = '弹幕567.xlsx'
sheet_name = 'Sheet1'
df = pds.read_excel(file_path, sheet_name=sheet_name)
# 定义要搜索的关键词
keyword = 'ai'
# 筛选出包含关键词的行
df = pds.read_excel(file_path, sheet_name=sheet_name)
# 筛选包含关键词的行
filtered_df = df[df['弹幕'].str.contains(keyword, na=False, case=False)]
# 计算每种内容的出现次数(不去重,直接计数)
content_counts = filtered_df['弹幕'].value_counts()
# 将结果转换为DataFrame并重置索引
content_counts_df = content_counts.reset_index()
content_counts_df.columns = ['Text', 'Total_Count']
# 写入新的Excel文件
output_file_path = 'AI相关弹幕.xlsx'
content_counts_df.to_excel(output_file_path, index=False)
print(f"包含关键词'{keyword}'的内容及其总数量已写入到'{output_file_path}'")
#词云图
def create_wordcloud():
# 需要排除的关键词列表
exclude_words = set(['', '', '','', '', '', '', '', '', '', '', '', '', '', '这个','不是','真的','我们', '你们', '他们'])
#mask = Image.open(r"D:\b站弹幕爬虫\mask.png").convert("L")
# 读取文件内容
with open('弹幕567.txt', encoding='utf-8') as f:
txt = f.read()
# 分词并排除指定关键词
words = jieba.lcut(txt)
filtered_words = [word for word in words if word not in exclude_words and len(word) > 1] # 排除太短或指定的词
string = ' '.join(filtered_words)
# 创建词云对象
wc = WordCloud(
width=1000,
height=700,
background_color='white',
colormap='viridis',
#mask=mask,
font_path='msyh.ttc'
)
# 生成词云
wc.generate(string)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off') # 不显示坐标轴
plt.show()
# 保存词云到文件
wc.to_file('词云567.png')
if __name__ == '__main__':
    # Full pipeline: crawl BV ids -> resolve cids -> download danmaku,
    # then persist (txt + xlsx), analyze (top-8, AI keyword), and visualize.
    # Order matters: each step feeds the next via module globals / files.
    bvid_get(pages,count)
    oid_get()
    danmu_get()
    create_txt()
    create_excel()
    top_eight_danmu()
    search_AI_danmu()
    create_wordcloud()
Loading…
Cancel
Save