You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

149 lines
5.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
from bs4 import BeautifulSoup
import re
import time
import random
import jieba
import wordcloud
import matplotlib.pyplot as plt
import pandas as pd
from pandas import ExcelWriter
from collections import Counter
# HTTP headers sent with every request; the User-Agent is a desktop
# browser string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
    ),
}

# Terms that mark a danmaku comment as AI-related (passed to `sort`).
keywords = [
    'AI',
    '人工智能',
    '机器学习',
    '深度学习',
    '神经网络',
    '自动',
    '算法',
    '数据科学',
    '自然语言',
    '计算机',
    '人工智能技术',
    '大数据',
    '预测分析',
    '机器视觉',
    '智能',
    '计算机',
    '人工智能应用',
    '数据分析',
    '情感计算',
    'ai',
]

# Running count of videos whose danmaku have been fetched; incremented by
# `get_danmu`.
videosnumber = 0
def get_search_page(search_url):
    """Download a search-results page and return its body as text.

    Args:
        search_url: Full URL of the search-results page to fetch.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # Without a timeout requests waits indefinitely, so one stalled
    # connection would hang the whole crawl.
    response = requests.get(search_url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text
def extract_video_links(page_content):
    """Collect the link of every video card found in a search-results page.

    Args:
        page_content: HTML text of a search-results page.

    Returns:
        A list with the ``href`` value of each matching anchor, in document
        order. Anchors that have no (or an empty) ``href`` are skipped so
        callers never receive ``None``.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    anchors = soup.select(".video-list.row div.bili-video-card > div > a")
    return [a_tag['href'] for a_tag in anchors if a_tag.get('href')]
def extract__BV(video_urls):
    """Extract the BV id from each video URL.

    The id is the path segment that follows ``/video/``. A trailing query
    string (``?from=search``) or fragment is not part of the id and is
    excluded.

    Args:
        video_urls: Iterable of video URLs. Entries that are not strings
            (for example ``None``) or that contain no ``/video/`` segment
            are skipped.

    Returns:
        A list of BV id strings, in input order.
    """
    # Stop at '/', '?' or '#' so only the id itself is captured.
    bv_pattern = re.compile(r'/video/([^/?#]+)')
    links = []
    for video_url in video_urls:
        if not isinstance(video_url, str):
            continue
        video_id_match = bv_pattern.search(video_url)
        if video_id_match:
            links.append(video_id_match.group(1))
    return links
def get_cid_from_bv(bv_ids):
    """Look up the cid of each video through the video-detail API.

    Args:
        bv_ids: Iterable of BV id strings.

    Returns:
        A list of cid values, in input order. Videos for which the API
        reports a non-zero ``code`` or returns no cid are skipped.

    Raises:
        requests.HTTPError: If a request answers with a 4xx/5xx status.
        requests.Timeout: If a request does not answer within the timeout.
    """
    cids = []
    for bv_id in bv_ids:
        video_url = f'https://api.bilibili.com/x/web-interface/view?bvid={bv_id}'
        # The timeout keeps one stalled request from blocking the crawl.
        response = requests.get(video_url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data.get('code') != 0:
            continue
        # `data` may be present but null in the payload; treat that like a
        # missing entry instead of calling .get on None.
        cid = (data.get('data') or {}).get('cid')
        if cid is not None:
            cids.append(cid)
    return cids
def get_danmu(id):
    """Fetch the danmaku (bullet comments) of one video.

    Increments the module-level ``videosnumber`` counter, prints a progress
    line, and sleeps a random 0-3 seconds before returning to throttle
    consecutive requests.

    Args:
        id: The cid passed as ``oid`` to the danmaku endpoint. The name
            shadows the builtin but is kept for caller compatibility.

    Returns:
        A list of danmaku strings with all spaces removed. Entries that are
        empty after space removal are dropped.

    Raises:
        requests.Timeout: If the server does not answer within the timeout.
    """
    global videosnumber
    video_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={id}'
    response = requests.get(video_url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    # Name the parser explicitly: letting BeautifulSoup guess makes the
    # result depend on which parsers are installed and emits a warning.
    soup = BeautifulSoup(response.text, 'html.parser')
    danmu_texts = []
    for d_tag in soup.find_all("d"):
        # get_text() returns '' for an empty element, where `.string` would
        # be None and crash the replace call.
        text = d_tag.get_text().replace(' ', '')
        if text:
            danmu_texts.append(text)
    videosnumber = videosnumber + 1
    bulletnumber = len(danmu_texts)
    print(f"这是第{videosnumber}视频, 获取到{bulletnumber}弹幕")
    time.sleep(random.randint(0, 2) + random.random())
    return danmu_texts
def page(url, num):
    """Return ``url`` pointing at result page ``num + 1``.

    Any ``page`` query parameter already present in ``url`` is replaced, so
    the function can be fed its own previous output. All other parts of the
    URL, including the search keyword, are kept as given.

    Args:
        url: Search URL, with or without a query string.
        num: Zero-based page index; the emitted page number is ``num + 1``.

    Returns:
        The URL with ``page=<num + 1>`` as its last query parameter.
    """
    # Remove a previous page parameter (and the '&' that follows it, if any)
    # so repeated calls do not stack several page values.
    base = re.sub(r'(?<=[?&])page=[^&]*&?', '', url).rstrip('?&')
    separator = '&' if '?' in base else '?'
    return f'{base}{separator}page={num + 1}'
def wcloud(alltxt, font_path='simsun.ttc'):
    """Render a word cloud from the collected danmaku and display it.

    Each danmaku is segmented on its own with jieba, so a word can never be
    formed from the end of one comment and the start of the next. Tokens of
    length 1 are discarded.

    Args:
        alltxt: Iterable of danmaku strings.
        font_path: Font file handed to ``WordCloud``; defaults to the font
            the script originally hard-coded.

    Returns:
        None. If no token survives filtering, a notice is printed and no
        figure is shown.
    """
    words = []
    for danmu in alltxt:
        for word in jieba.cut(danmu):
            if len(word) > 1:
                words.append(word)
    if not words:
        # WordCloud.generate raises when it is given no words at all.
        print('没有可用的弹幕, 跳过词云生成')
        return
    wc = wordcloud.WordCloud(height=1000, width=1000, font_path=font_path)
    wc.generate(' '.join(words))
    print(wc)
    plt.imshow(wc)
    plt.show()
def sort(txt, keywords):
    """Count the danmaku that contain at least one keyword token.

    A danmaku matches when any token produced by ``jieba.cut`` is equal to
    one of ``keywords``; this is exact token equality, not substring search.

    Args:
        txt: Iterable of danmaku strings.
        keywords: Iterable of keyword strings.

    Returns:
        A ``Counter`` mapping each matching danmaku to how many times it
        occurs in ``txt``.
    """
    # Build the lookup set once instead of scanning the keyword list for
    # every token of every danmaku.
    keyword_set = set(keywords)
    comment_counter = Counter()
    for line in txt:
        if not keyword_set.isdisjoint(jieba.cut(line)):
            comment_counter[line] += 1
    return comment_counter
def save_to_excel(danmu_data, filename='danmu_data.xlsx'):
    """Write the danmaku to an Excel workbook.

    Args:
        danmu_data: Danmaku values to store; they become the rows of a
            single column titled '弹幕'.
        filename: Path of the workbook to create.
    """
    table = pd.DataFrame(data=danmu_data, columns=['弹幕'])
    # The writer is a context manager, so the workbook is saved and closed
    # when the block exits.
    with ExcelWriter(filename, engine='openpyxl') as workbook:
        table.to_excel(workbook, index=False)
def main(kword, mubiao):
    """Crawl search results and collect danmaku from up to ``mubiao`` videos.

    Walks at most 100 result pages. For each page it extracts the video
    links, resolves them to cids, and fetches each video's danmaku until
    the target number of videos is reached.

    Args:
        kword: Search keyword used to build the initial search URL.
        mubiao: Target number of videos to fetch danmaku from.

    Returns:
        A single list with the danmaku of every fetched video, across all
        visited pages.
    """
    search_url = f'https://search.bilibili.com/video?keyword={kword}'
    # Accumulate across pages; resetting this per page would discard
    # everything gathered from earlier pages.
    alltxt = []
    # Count videos locally so the limit applies to this call rather than to
    # whatever the module-level counter holds.
    fetched = 0
    for i in range(100):
        if fetched >= mubiao:
            break
        search_url = page(search_url, i)
        page_content = get_search_page(search_url)
        video_links = extract_video_links(page_content)
        bvs = extract__BV(video_links)
        if not bvs:
            # A page with no videos means there is nothing more to crawl.
            break
        for cid in get_cid_from_bv(bvs):
            if fetched >= mubiao:
                break
            alltxt.extend(get_danmu(cid))
            fetched += 1
    return alltxt
# Script entry point. Guarded so that importing this module does not start a
# crawl; running the file directly behaves as before.
if __name__ == '__main__':
    keword = "2024巴黎奥运会"  # search keyword for the videos to crawl
    flag = 10  # number of videos to crawl
    alltxt = main(keword, flag)
    # Persist the data before plotting: the plot step blocks on a window
    # and can fail (e.g. missing font), which must not lose the crawl.
    save_to_excel(alltxt)
    wcloud(alltxt)
    comment_counter = sort(alltxt, keywords)
    top_comments = comment_counter.most_common(8)
    # Print the 8 most frequent AI-related danmaku.
    for comment, count in top_comments:
        print(f'弹幕: {comment}, 数量: {count}')