# Bilibili danmaku word-cloud scraper:
# search page -> video links -> BV ids -> cids -> danmaku comments -> word cloud.
# (Removed web-UI boilerplate that was accidentally captured when this file was
# copied from a repository web page; it was not part of the program.)
import requests
from bs4 import BeautifulSoup
import re
import jieba # pip install jieba
import wordcloud # pip install wordcloud
import imageio #
import matplotlib.pyplot as plt
# Browser-like User-Agent header so Bilibili serves normal responses instead of
# rejecting the requests as coming from a bot.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0' }
# Fetch the raw HTML of the search results page.
def get_search_page(search_url):
    """Return the HTML text of *search_url*.

    Raises requests.HTTPError (via raise_for_status) when the server
    responds with a 4xx/5xx status code.
    """
    response = requests.get(search_url, headers=headers)
    response.raise_for_status()  # fail fast instead of parsing an error page
    return response.text
# Extract the links of every video on the search results page.
def extract_video_links(page_content):
    """Return a list of href values for all video cards in *page_content*.

    NOTE(review): the CSS selector is tied to the current Bilibili search
    page markup and may need adjusting if the site structure changes.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    video_links = []
    for a_tag in soup.select(".video-list.row div.bili-video-card > div > a"):
        video_links.append(a_tag.get('href'))
    return video_links
# Extract the BV identifier from a video URL.
def extract__BV(video_url):
    """Return the BV id embedded in *video_url* (the path segment after
    '/video/'), or None when the URL contains no such segment.
    """
    video_id_match = re.search(r'/video/([^/]+)', video_url)
    if video_id_match:
        return video_id_match.group(1)
    return None
def get_cid_from_bv(bv_id):
    """Resolve a BV id to its cid via the Bilibili video-detail API.

    Returns the cid (int) on success, or None when the API reports a
    non-zero error code or the cid field is missing.
    Raises requests.HTTPError on a 4xx/5xx response.
    """
    # Video-detail API endpoint.
    video_url = f'https://api.bilibili.com/x/web-interface/view?bvid={bv_id}'
    response = requests.get(video_url, headers=headers)
    response.raise_for_status()
    data = response.json()
    # code == 0 is the API's success marker.
    if data.get('code') == 0:
        return data.get('data', {}).get('cid')
    return None
def get_danmu(id):
    """Download the danmaku (bullet comments) for a video cid.

    Returns a list of comment strings with spaces removed.
    """
    video_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={id}'
    response = requests.get(video_url, headers=headers)
    response.encoding = 'utf-8'  # the endpoint serves UTF-8 XML
    # Name the parser explicitly: a bare BeautifulSoup(html) call picks an
    # arbitrary installed parser and emits a warning.
    soup = BeautifulSoup(response.text, 'html.parser')
    all_txt = soup.findAll("d")  # each <d> tag holds one danmaku line
    # d.string is None for empty <d> tags; skip those so .replace cannot
    # raise AttributeError. (A dead list of the tags' "p" attributes was
    # built here before; it was never used and has been removed.)
    return [d.string.replace(' ', '') for d in all_txt if d.string]
# Main pipeline: search page -> video links -> BV ids -> cids.
def main(search_url):
    """Return the list of cids for every video found on *search_url*.

    Videos whose link has no BV id, or whose cid lookup fails, are
    skipped rather than contributing a None entry.
    """
    page_content = get_search_page(search_url)
    video_links = extract_video_links(page_content)
    bvs = []
    for link in video_links:
        bv = extract__BV(link)
        if bv:
            bvs.append(bv)
    cids = []
    for bv in bvs:
        cid = get_cid_from_bv(bv)
        # Drop failed lookups: passing None downstream would query the
        # danmaku API with oid=None.
        if cid is not None:
            cids.append(cid)
    return cids
# Example search page URL; replace the keyword with whatever you want to scrape.
search_url = 'https://search.bilibili.com/all?keyword=2024巴黎奥运会'
aa = main(search_url)

# Collect the danmaku of every video into one flat list.
alltxt = []
for cid in aa:  # renamed from `id` to avoid shadowing the builtin
    alltxt = alltxt + get_danmu(cid)

danmustr = ''.join(alltxt)  # one big string for jieba segmentation
words = list(jieba.cut(danmustr))  # split Chinese text into tokens
words = [w for w in words if len(w) > 1]  # drop single-character noise tokens

# Render the word cloud; font_path must point to a font with CJK glyphs.
wc = wordcloud.WordCloud(height=1000, width=1000, font_path='simsun.ttc')
wc.generate(' '.join(words))
plt.imshow(wc)
plt.show()