import requests
import re
from lxml import etree
import pandas as pd
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import numpy as np

def fetch_videos(keyword, num_videos=300):
    """Collect video page hrefs from Bilibili search results for `keyword`."""
    search_url = f"https://search.bilibili.com/all?keyword={keyword}"
    headers = {
        # session cookie copied from a logged-in browser; replace with your own
        "cookie": "buvid_fp_plain=undefined; DedeUserID=478046906; DedeUserID__ckMd5=e069fb2f7c7e45d8; LIVE_BUVID=AUTO5416552130626367; buvid4=A2EA20E9-E779-847F-39B4-938CD4287F1714203-022061113-BEFmx%2F6H9VrRuwt93E6aCjRHU0GpnkXpk1pE1uK5mJmVSDNUChtrag%3D%3D; CURRENT_FNVAL=4048; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; PVID=1; rpdid=|(RYk~~RkJY0J'u~|JYRkR|R; FEED_LIVE_VERSION=V_HEADER_LIVE_NEW_POP; buvid3=4799A87F-A9DC-AA07-27BF-82F3A5F664F239018infoc; b_nut=1720075339; _uuid=668710B22-5122-4109F-5104D-74B5666E10F1837667infoc; CURRENT_QUALITY=80; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY0MDMyNzgsImlhdCI6MTcyNjE0NDAxOCwicGx0IjotMX0.DQbK-ZNXUzxZXH3Z5cZzBsvahbrDIGLhTVNtZn_q2Fs; bili_ticket_expires=1726403218; SESSDATA=8b95e9ea%2C1741745313%2C0d10f%2A91CjCH0RwJFik2-i32G8hIuFSVDWydWuXvE1N-D4N9IoB01Z1nhfZSlOu-FUStWixjMPwSVm1XLXhhVVlsLVVILTA3cW1JVHJMUWFvUzYxMktjRHNtcUJHV29CWUpCNzFkM2NESU5vRzgzM3JYSDNJelF1QkFHaklNQmpITXA5YzdZTWhYUjRMcUVnIIEC; bili_jct=2b74ad2d258f25b4bc6e4d71843bdfb0; sid=4p5183l0; bp_t_offset_478046906=976512964238508032; fingerprint=271eb250ccf16737050c17694464d5fe; b_lsid=29DAAAB7_191EA6E45AE; browser_resolution=1528-151; buvid_fp=4799A87F-A9DC-AA07-27BF-82F3A5F664F239018infoc",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
        "referer": "https://www.bilibili.com/"
    }
    response = requests.get(url=search_url, headers=headers)
    # response.encoding = 'utf-8'
    # html_data = response.text
    # print(html_data)  # check whether the response was fetched successfully
    # Get the page content
    html_content = response.content
    # Parse the HTML content
    tree = etree.HTML(html_content)
    result = []
    for index in range(1, 31):
        div_text = tree.xpath(f'//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[3]/div/div[{index}]/div/div[2]/a/@href')
        if div_text:  # guard: some result slots carry no video href
            result.append(div_text[0])
        # print(div_text[0])  # the video link
    # XPath paths on the first result page:
    # //*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[3]/div/div[1]/div/div[2]/a
    # //*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[3]/div/div[2]/div/div[2]/a
    # //*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[3]/div/div[3]/div/div[2]/a
    page_size = 30
    pages = num_videos // page_size
    for page_index in range(1, pages):
        search_url = f"https://search.bilibili.com/all?keyword={keyword}&page={page_index + 1}&o={page_index * page_size}"
        response = requests.get(url=search_url, headers=headers)
        html_content = response.content
        tree = etree.HTML(html_content)
        for index in range(1, 31):
            div_text = tree.xpath(f'//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div[1]/div[{index}]/div/div[2]/a/@href')
            if div_text:
                result.append(div_text[0])
        # Page 2 and later use a different XPath structure than page 1, so the path is rewritten:
        # //*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div[1]/div[1]/div/div[2]/a
        # //*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div[1]/div[2]/div/div[2]/a
        # print(div_text[0])  # check success
    return result
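
# A minimal usage sketch (illustrative; the hard-coded XPath above depends on
# Bilibili's current search-page DOM, which changes frequently):
#
#   hrefs = fetch_videos("巴黎奥运会", num_videos=60)
#   # each entry is a protocol-relative href, e.g.
#   # "//www.bilibili.com/video/BV1Kz421i71x/"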

def fetch_bullet(url):
    """Fetch the bullet comments (danmaku) of one video with a regular
    expression and append them to bullet.txt."""
    headers = {
        # session cookie copied from a logged-in browser; replace with your own
        "cookie": "buvid_fp_plain=undefined; DedeUserID=478046906; DedeUserID__ckMd5=e069fb2f7c7e45d8; LIVE_BUVID=AUTO5416552130626367; buvid4=A2EA20E9-E779-847F-39B4-938CD4287F1714203-022061113-BEFmx%2F6H9VrRuwt93E6aCjRHU0GpnkXpk1pE1uK5mJmVSDNUChtrag%3D%3D; CURRENT_FNVAL=4048; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; PVID=1; rpdid=|(RYk~~RkJY0J'u~|JYRkR|R; FEED_LIVE_VERSION=V_HEADER_LIVE_NEW_POP; buvid3=4799A87F-A9DC-AA07-27BF-82F3A5F664F239018infoc; b_nut=1720075339; _uuid=668710B22-5122-4109F-5104D-74B5666E10F1837667infoc; CURRENT_QUALITY=80; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY0MDMyNzgsImlhdCI6MTcyNjE0NDAxOCwicGx0IjotMX0.DQbK-ZNXUzxZXH3Z5cZzBsvahbrDIGLhTVNtZn_q2Fs; bili_ticket_expires=1726403218; SESSDATA=8b95e9ea%2C1741745313%2C0d10f%2A91CjCH0RwJFik2-i32G8hIuFSVDWydWuXvE1N-D4N9IoB01Z1nhfZSlOu-FUStWixjMPwSVm1XLXhhVVlsLVVILTA3cW1JVHJMUWFvUzYxMktjRHNtcUJHV29CWUpCNzFkM2NESU5vRzgzM3JYSDNJelF1QkFHaklNQmpITXA5YzdZTWhYUjRMcUVnIIEC; bili_jct=2b74ad2d258f25b4bc6e4d71843bdfb0; sid=4p5183l0; fingerprint=271eb250ccf16737050c17694464d5fe; browser_resolution=1528-786; buvid_fp=271eb250ccf16737050c17694464d5fe; b_lsid=89A9842E_191EBAF4B62; bsource=search_bing; bp_t_offset_478046906=976675180858310656",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }
    response = requests.get(url=url, headers=headers)
    # Get the page content
    html_content = response.content
    # Parse the HTML content
    tree = etree.HTML(html_content)
    list_url = tree.xpath('//*[@id="dtl"]/div[5]/input/@value')
    # Yields e.g. ['https://api.bilibili.com/x/v1/dm/list.so?oid=1633670800']
    #             ['https://api.bilibili.com/x/v1/dm/list.so?oid=1630123093']
    #             ['https://api.bilibili.com/x/v1/dm/list.so?oid=1633866069']
    # or [] -- some pages expose no link via this XPath, so skip those videos
    if not list_url:
        return
    bullet_url = list_url[0]
    # Fetch the danmaku with the extracted link
    response = requests.get(url=bullet_url, headers=headers)
    response.encoding = 'utf-8'
    html_data = response.text
    # Extract the danmaku with a regular expression (returns a list)
    content_list = re.findall('<d p=".*?">(.*?)</d>', html_data)
    # Join the list into a single string
    content = '\n'.join(content_list)
    # print(content)  # danmaku fetched successfully
    # Append the danmaku to a file; verified, proceed to the next step
    with open('bullet.txt', mode='a', encoding='utf-8') as f:
        f.write(content + '\n')
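
# The list.so endpoint returns XML in which each danmaku is one element:
#   <d p="23.826,1,25,16777215,...">danmaku text</d>
# A sketch of the same extraction inside fetch_bullet with a standard XML
# parser instead of the regex (an alternative, assuming the decoded response
# is well-formed XML):
#
#   from xml.etree import ElementTree as ET
#   root = ET.fromstring(response.content)
#   content_list = [d.text or '' for d in root.iter('d')]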

def keyword():
    """Count AI-related keywords in the collected danmaku, export the top
    eight to Excel, and render a word cloud."""
    with open('bullet.txt', mode='r', encoding='utf-8') as f:
        text = f.read()
    # Preprocess the text: strip punctuation and lowercase it
    text = re.sub(r'[^\w\s]', '', text).lower()
    # Keywords whose frequencies will be counted
    words = ['科学', '深度学习', '语音识别', '数据挖掘', '自动驾驶', '机器人', '图像识别', '自然语言处理', '算法', '模型', '语义理解', '建模', '算法推理', '智能', '芯片', '智能算法', '机器人技术', '神经网络', '特征提取', '自适应学习', '神经网络', '异常检测', '智能控制', '数据清洗', '多模态', '强化学习', '语音合成', '数据融合', '模型', '实时处理', '人工智能应用', '深度生成', '自监督', '数据分类', '自动', '迁移学习', '智能预测', '虚拟现实', '机器视觉', '算法优化', '模式识别', '语义分割', '生成对抗网络', '智能识别', '智能检索', '边缘计算', '模型推理', '数据标注', '神经架构搜索', '语言模型', '智能推荐', '自动化', '模式匹配', '智能助理', '计算机视觉', 'ai']
    # Count occurrences of each keyword (substring matches)
    word_counts = Counter()
    for word in words:
        word_counts[word] = text.count(word)
    # Take the top eight keywords and their frequencies
    top_eight = word_counts.most_common(8)
    # Export the result to Excel
    df = pd.DataFrame(top_eight, columns=['Keyword', 'Frequency'])
    df.to_excel('keyword_frequencies.xlsx', index=False)
    # print(top_eight)  # e.g. [('ai', 55), ('科学', 42), ('自动', 17), ('芯片', 6), ('机器人', 5), ('算法', 4), ('智能', 3), ('建模', 2)]
    # Convert the tuple list to a dict if needed:
    # word_freq_dict = dict(top_eight)
    mask_img = np.array(Image.open("mask.jpg"))
    # Stopwords: drop the ubiquitous runs of "哈哈" (laughter spam)
    stopwords = set(STOPWORDS).union({"哈哈", "哈哈哈", "哈哈哈哈", "哈哈哈哈哈", "哈哈哈哈哈哈", "哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈"})
    # Build the word cloud
    wordcloud = WordCloud(font_path='msyh.ttc',  # Chinese text needs a CJK-capable font
                          width=2750,
                          height=1700,
                          mask=mask_img,
                          background_color='white',
                          stopwords=stopwords).generate(text)
    # Save the word cloud image to a file
    wordcloud.to_file('wordcloud.png')
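
# str.count() counts raw substring hits, so nested phrases are double-counted
# (e.g. "智能" also matches inside "智能算法"). A sketch using proper Chinese
# word segmentation instead (an alternative, assuming the third-party jieba
# package is installed):
#
#   import jieba
#   tokens = jieba.lcut(text)
#   word_counts = Counter(t for t in tokens if t in set(words))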

def main():
    videos_url = fetch_videos("巴黎奥运会", 300)
    for index in range(len(videos_url)):
        # The hrefs are protocol-relative ("//www.bilibili.com/video/BV.../");
        # dropping the leading "//www." and prefixing "http://www.i" rewrites
        # them to the ibilibili.com mirror, whose page exposes the danmaku
        # list URL in the input field read by fetch_bullet()
        fetch_bullet('http://www.i' + videos_url[index][6:])
    keyword()


if __name__ == "__main__":
    main()
# Debugging note: fetch_videos() returns protocol-relative hrefs such as
# //www.bilibili.com/video/BV1Kz421i71x/ -- note the missing scheme, which is
# why main() rewrites each link before passing it to fetch_bullet() to
# collect the danmaku.
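
# A politeness sketch (an assumption, not part of the original script):
# pausing between requests lowers the chance of being rate-limited or blocked.
#
#   import time
#   for href in fetch_videos("巴黎奥运会", 300):
#       fetch_bullet('http://www.i' + href[6:])
#       time.sleep(1)  # wait one second between videos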