ADD file via upload

main
p7zwxau5j 2 months ago
parent 146555172e
commit dc524bced7

@ -0,0 +1,191 @@
import requests
import re
import pandas as pds
import time
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
from bs4 import BeautifulSoup
from collections import Counter
from openpyxl import Workbook
# Module-level state shared by the crawl and export steps
pages = 15  # number of search-result pages to crawl
count = 300  # number of videos to crawl
s = set()  # de-duplicated BV ids
bvid_list = []  # BV ids
oid_list = []  # OID (cid) values
content_list = []  # danmaku (bullet comment) text
# Crawl BV ids
def bvid_get(page, count):
    """Collect BV ids from the Bilibili search result pages.

    Requests result pages 1..``page`` (inclusive) for the hard-coded search
    keyword and extracts every ``bvid:"..."`` occurrence from the response
    text. Each id not seen before is recorded in the module-level set ``s``
    and appended to the module-level ``bvid_list`` in first-seen order.
    Crawling stops as soon as ``bvid_list`` holds ``count`` ids.

    Args:
        page: Number of result pages to request.
        count: Maximum number of BV ids to keep in ``bvid_list``.

    Returns:
        None. Results are stored in ``bvid_list`` / ``s``.

    A ``requests.RequestException`` is printed rather than raised; ids
    gathered before the failure are kept.
    """
    # NOTE(review): this cookie embeds a logged-in session (SESSDATA /
    # bili_jct). It should not live in source control; consider loading it
    # from configuration or the environment instead.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
    }
    try:
        # range() excludes its upper bound, so add 1 to really visit `page` pages.
        for page_no in range(1, page + 1):
            url = f'https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={page_no}'
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url=url, headers=headers, timeout=10)
            response.encoding = 'utf-8'
            for bvid in re.findall(r'bvid:"(.*?)"', response.text):
                if bvid in s:
                    continue
                s.add(bvid)
                # Appending as ids are found keeps first-seen order and
                # retains partial results when fewer than `count` exist.
                bvid_list.append(bvid)
                if len(bvid_list) >= count:
                    break
            if len(bvid_list) >= count:
                break
    except requests.RequestException as e:
        print(e)
    print("爬取", len(bvid_list), "个视频的bvid")
# Crawl the OID (cid) values of the videos
def oid_get():
    """Look up the cid values for every BV id in ``bvid_list``.

    For each id in the module-level ``bvid_list`` the player ``pagelist``
    endpoint is requested and every ``"cid":<value>,`` occurrence in the
    response text is appended (as a string) to the module-level
    ``oid_list``.

    Returns:
        None. Results are appended to ``oid_list``.

    A ``requests.RequestException`` for one video is printed and that video
    is skipped; the remaining videos are still processed.
    """
    # NOTE(review): this cookie embeds a logged-in session (SESSDATA /
    # bili_jct). It should not live in source control; consider loading it
    # from configuration or the environment instead.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
    }
    for bvid in bvid_list:
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # Handle the failure per video so one bad request does not
            # discard every video that comes after it.
            print(e)
            continue
        response.encoding = 'utf-8'
        oid_list.extend(re.findall(r'"cid":(.*?),', response.text))
# Crawl the danmaku (bullet comments)
def danmu_get():
    """Download the danmaku text for every cid in ``oid_list``.

    For each value in the module-level ``oid_list`` the ``dm/list.so``
    endpoint is requested and the text of every ``<d p="...">...</d>``
    element found in the response is appended to the module-level
    ``content_list``. XML character entities in the text are decoded so the
    stored comment matches what was typed. The function pauses 0.5 seconds
    after each request and prints the total number of collected comments at
    the end.

    Returns:
        None. Results are appended to ``content_list``.

    A ``requests.RequestException`` for one cid is printed and that cid is
    skipped; the remaining ones are still processed.
    """
    from xml.sax.saxutils import unescape

    # unescape() handles &amp; &lt; &gt; itself; the quote entities must be
    # supplied explicitly.
    extra_entities = {'&quot;': '"', '&apos;': "'"}
    # NOTE(review): this cookie embeds a logged-in session (SESSDATA /
    # bili_jct). It should not live in source control; consider loading it
    # from configuration or the environment instead.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f"
    }
    for oid in oid_list:
        url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={oid}'
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # Handle the failure per cid so one bad request does not discard
            # every cid that comes after it.
            print(e)
        else:
            response.encoding = 'utf-8'
            content_list.extend(
                unescape(text, extra_entities)
                for text in re.findall(r'<d p=".*?">(.*?)</d>', response.text)
            )
        # Pause between requests to avoid hammering the endpoint.
        time.sleep(0.5)
    print("弹幕数量", len(content_list))
# Write the danmaku to a text file
def create_txt():
    """Append every entry of ``content_list`` to ``弹幕567.txt``, one per line.

    The file is opened once in append mode with UTF-8 encoding, so repeated
    runs keep adding to the existing file. When ``content_list`` is empty
    the file is not opened (and therefore not created).

    Returns:
        None.
    """
    if not content_list:
        return
    # Open the file a single time instead of once per comment.
    with open('弹幕567.txt', mode='a', encoding='utf-8') as f:
        f.writelines(content + '\n' for content in content_list)
# Write the danmaku to an Excel workbook
def create_excel():
    """Save ``content_list`` as the single column ``弹幕`` of ``./弹幕567.xlsx``.

    The row index is not written to the workbook.
    """
    danmu_frame = pds.DataFrame({'弹幕': content_list})
    danmu_frame.to_excel("./弹幕567.xlsx", index=False)
# Find the AI-related danmaku and the top 8 among them
def AI_danmu_and_top_eight_danmu():
    """Count danmaku containing the keyword ``AI`` and export the results.

    Reads sheet ``Sheet1`` of ``弹幕567.xlsx``, keeps the rows whose ``弹幕``
    column contains the keyword (case-insensitive, missing values treated as
    no match) and counts how often each distinct text occurs. The full count
    table is written to ``AI相关弹幕.xlsx`` and the eight largest counts to
    ``AI相关弹幕_前8.xlsx``, both with columns ``Text`` and ``Total_Count``.
    A confirmation line is printed after each file is written.
    """
    keyword = 'AI'

    def export_counts(counts, path):
        # Turn the counts Series into a two-column table and save it.
        table = counts.reset_index()
        table.columns = ['Text', 'Total_Count']
        table.to_excel(path, index=False)

    danmu = pds.read_excel('弹幕567.xlsx', sheet_name='Sheet1')

    # Boolean mask of rows mentioning the keyword.
    has_keyword = danmu['弹幕'].str.contains(keyword, na=False, case=False)
    # Occurrences per distinct text (duplicates are counted, not removed).
    per_text = danmu.loc[has_keyword, '弹幕'].value_counts()

    all_path = 'AI相关弹幕.xlsx'
    export_counts(per_text, all_path)
    print(f"包含关键词'{keyword}'的内容及其总数量已写入到'{all_path}'")

    top_path = 'AI相关弹幕_前8.xlsx'
    export_counts(per_text.nlargest(8), top_path)
    print(f"包含关键词'{keyword}'的内容及其前八个总数量已写入到'{top_path}'")
# Word cloud image
def create_wordcloud():
    """Render a word cloud from ``弹幕567.txt`` and save it as ``词云567.png``.

    The text is segmented with ``jieba.lcut``; tokens of length 1 or less and
    tokens in the exclusion set are dropped. The remaining tokens are joined
    with spaces and passed to a 1000x700 ``WordCloud`` with a white
    background, the ``viridis`` colormap and the ``msyh.ttc`` font. The image
    is shown with matplotlib (axes hidden) and then written to disk. No mask
    image is applied.
    """
    # Words that must not appear in the cloud.
    stop_words = {'', '这个', '不是', '真的', '我们', '你们', '他们'}

    with open('弹幕567.txt', encoding='utf-8') as source:
        raw_text = source.read()

    kept_tokens = []
    for token in jieba.lcut(raw_text):
        # Skip excluded words and anything too short.
        if token in stop_words or len(token) <= 1:
            continue
        kept_tokens.append(token)
    corpus = ' '.join(kept_tokens)

    cloud = WordCloud(
        font_path='msyh.ttc',
        colormap='viridis',
        background_color='white',
        height=700,
        width=1000,
    )
    cloud.generate(corpus)

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()

    # Save the rendered cloud to a file.
    cloud.to_file('词云567.png')
if __name__ == '__main__':
    # Run the pipeline in order: collect BV ids, resolve cids, download the
    # danmaku, export to txt and Excel, analyse the AI keyword, draw the cloud.
    bvid_get(pages, count)
    for step in (
        oid_get,
        danmu_get,
        create_txt,
        create_excel,
        AI_danmu_and_top_eight_danmu,
        create_wordcloud,
    ):
        step()
Loading…
Cancel
Save