You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
158 lines
9.6 KiB
158 lines
9.6 KiB
2 months ago
|
import requests
|
||
|
import json
|
||
|
import re
|
||
|
from lxml import etree
|
||
|
import pandas as pd
|
||
|
from collections import Counter
|
||
|
from wordcloud import WordCloud, STOPWORDS
|
||
|
from PIL import Image
|
||
|
import numpy as np
|
||
|
|
||
|
|
||
|
def _search_page_links(tree, xpath_template, page_size):
    """Return the first href found in each result slot 1..page_size of a parsed search page.

    Slots for which the XPath matches nothing are skipped instead of raising
    IndexError, so a short or differently laid-out page yields fewer links.
    """
    if tree is None:
        return []
    links = []
    for slot in range(1, page_size + 1):
        hrefs = tree.xpath(xpath_template.format(index=slot))
        if hrefs:
            links.append(hrefs[0])
    return links


def fetch_videos(keyword, num_videos=300):
    """Collect video links from Bilibili's search result pages for *keyword*.

    Pages are requested one after another (30 result slots per page) until
    ``num_videos`` links have been gathered or a page yields no links.

    Args:
        keyword: Search term; it is passed as a query parameter so that
            characters such as ``&`` or ``#`` are URL-encoded correctly.
        num_videos: Maximum number of links to return.

    Returns:
        A list of at most ``num_videos`` href strings exactly as they appear
        in the page (the file's own notes show values of the form
        ``//www.bilibili.com/video/BV1Kz421i71x/``).

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.

    Note:
        No account cookie is embedded in the source any more. If the
        ``BILIBILI_COOKIE`` environment variable is set, its value is sent
        as the ``cookie`` header.
    """
    import os  # local import: only this function needs it

    if num_videos <= 0:
        return []

    page_size = 30

    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
        "referer": "https://www.bilibili.com/",
    }
    cookie = os.environ.get("BILIBILI_COOKIE")
    if cookie:
        headers["cookie"] = cookie

    # The first page and the following pages use different DOM layouts,
    # so each needs its own XPath ({index} is the 1-based result slot).
    first_page_xpath = '//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[3]/div/div[{index}]/div/div[2]/a/@href'
    later_page_xpath = '//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div[1]/div[{index}]/div/div[2]/a/@href'

    total_pages = -(-num_videos // page_size)  # ceiling division

    result = []
    for page in range(1, total_pages + 1):
        params = {"keyword": keyword}
        if page > 1:
            params["page"] = page
            params["o"] = (page - 1) * page_size  # result offset of this page

        response = requests.get(
            "https://search.bilibili.com/all",
            params=params,
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()

        tree = etree.HTML(response.content)
        template = first_page_xpath if page == 1 else later_page_xpath
        links = _search_page_links(tree, template, page_size)
        if not links:
            # Nothing matched on this page; stop instead of requesting more.
            break
        result.extend(links)

    return result[:num_videos]
|
||
|
|
||
|
|
||
|
def fetch_bullet(url):
    """Download the danmaku (bullet comments) of one video and append them to ``bullet.txt``.

    The page at *url* is expected to expose a danmaku list link of the form
    ``https://api.bilibili.com/x/v1/dm/list.so?oid=...``. The link is looked
    up with XPath first; because that fails on some pages, the raw HTML is
    then searched with a regular expression. If no link is found the video
    is skipped.

    Args:
        url: Address of the page that contains the danmaku list link.

    Returns:
        None. Each comment is appended to ``bullet.txt`` on its own line.

    Raises:
        requests.RequestException: On network failure or timeout.

    Note:
        No account cookie is embedded in the source any more. If the
        ``BILIBILI_COOKIE`` environment variable is set, it is sent only
        with the danmaku request and only when that request goes to a
        ``bilibili.com`` host, never to the page at *url*.
    """
    import os  # local imports: only this function needs them
    from urllib.parse import urlsplit

    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
    }

    page = requests.get(url=url, headers=headers, timeout=10)
    tree = etree.HTML(page.content)

    # Preferred lookup: the link sits in an <input value="..."> inside #dtl.
    list_url = tree.xpath("//*[@id=\"dtl\"]/div[5]/input/@value") if tree is not None else []
    if list_url:
        bullet_url = list_url[0]
    else:
        # Fallback for pages where the XPath matches nothing.
        found = re.search(r'https?://api\.bilibili\.com/x/v1/dm/list\.so\?oid=\d+', page.text)
        if found is None:
            return  # no danmaku link on this page; nothing to download
        bullet_url = found.group(0)

    danmaku_headers = dict(headers)
    cookie = os.environ.get("BILIBILI_COOKIE")
    host = urlsplit(bullet_url).hostname or ""
    if cookie and (host == "bilibili.com" or host.endswith(".bilibili.com")):
        danmaku_headers["cookie"] = cookie

    response = requests.get(url=bullet_url, headers=danmaku_headers, timeout=10)
    response.encoding = 'utf-8'

    # Each comment is the text of a <d p="...">...</d> element.
    content_list = re.findall('<d p=".*?">(.*?)</d>', response.text)

    with open('bullet.txt', mode='a', encoding='utf-8') as f:
        if content_list:
            # The trailing newline keeps the last comment of this video from
            # fusing with the first comment of the next one.
            f.write('\n'.join(content_list) + '\n')
|
||
|
|
||
|
|
||
|
def keyword():
    """Analyse the danmaku collected in ``bullet.txt``.

    Steps:
      1. Read ``bullet.txt``, strip punctuation and lower-case the text.
      2. Count how often each term of a fixed AI-related word list occurs
         (plain substring counts) and write the eight most frequent terms
         to ``keyword_frequencies.xlsx``.
      3. Render a word cloud of the whole text, shaped by ``mask.jpg``, and
         save it as ``wordcloud.png``.

    Returns:
        None. Results are written to the two output files.
    """
    with open('bullet.txt', mode='r', encoding='utf-8') as f:
        text = f.read()

    # Drop everything that is neither a word character nor whitespace,
    # then lower-case so that e.g. "AI" is counted under 'ai'.
    text = re.sub(r'[^\w\s]', '', text).lower()

    words = [
        '科学', '深度学习', '语音识别', '数据挖掘', '自动驾驶', '机器人', '图像识别',
        '自然语言处理', '算法', '模型', '语义理解', '建模', '算法推理', '智能', '芯片',
        '智能算法', '机器人技术', '神经网络', '特征提取', '自适应学习', '神经网络',
        '异常检测', '智能控制', '数据清洗', '多模态', '强化学习', '语音合成', '数据融合',
        '模型', '实时处理', '人工智能应用', '深度生成', '自监督', '数据分类', '自动',
        '迁移学习', '智能预测', '虚拟现实', '机器视觉', '算法优化', '模式识别', '语义分割',
        '生成对抗网络', '智能识别', '智能检索', '边缘计算', '模型推理', '数据标注',
        '神经架构搜索', '语言模型', '智能推荐', '自动化', '模式匹配', '智能助理',
        '计算机视觉', 'ai',
    ]

    # Substring occurrences of every term; repeated list entries simply
    # overwrite themselves with the same count.
    word_counts = Counter({word: text.count(word) for word in words})
    top_eight = word_counts.most_common(8)

    # Export the ranking to Excel.
    df = pd.DataFrame(top_eight, columns=['Keyword', 'Frequency'])
    df.to_excel('keyword_frequencies.xlsx', index=False)

    # Load the mask and close the image file as soon as the pixels are copied.
    with Image.open("mask.jpg") as mask_file:
        mask_img = np.array(mask_file)

    # Laughter filler ("haha...") of various lengths is excluded from the cloud.
    stopwords = set(STOPWORDS).union({"哈哈","哈哈哈","哈哈哈哈", "哈哈哈哈哈", "哈哈哈哈哈哈", "哈哈哈哈哈哈哈", "哈哈哈哈哈哈哈哈哈","哈哈哈哈哈哈哈哈哈","哈哈哈哈哈哈哈哈哈哈","哈哈哈哈哈哈哈哈哈哈哈哈哈","哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈","哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈","哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈"})

    wordcloud = WordCloud(
        font_path='msyh.ttc',  # a font with CJK glyphs, so Chinese text can be drawn
        width=2750,
        height=1700,
        mask=mask_img,
        background_color='white',
        stopwords=stopwords,
    ).generate(text)

    # Save the rendered cloud.
    wordcloud.to_file('wordcloud.png')
|
||
|
|
||
|
|
||
|
def main():
    """Run the whole pipeline: search videos, download their danmaku, analyse them.

    Every link returned by ``fetch_videos`` is processed, however many there
    are, so a search that yields fewer than 300 links no longer raises
    IndexError.
    """
    videos_url = fetch_videos("巴黎奥运会", 300)
    for video_url in videos_url:
        # Per the file's notes the links look like
        # //www.bilibili.com/video/BV1Kz421i71x/ ; dropping the first six
        # characters ("//www.") and prefixing 'http://www.i' gives the same
        # path on www.ibilibili.com, the page fetch_bullet reads.
        fetch_bullet('http://www.i' + video_url[6:])
    keyword()
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# Script entry point: run the scrape-and-analyse pipeline only when this file
# is executed directly, not when it is imported.
# NOTE: fetch_videos() yields protocol-relative links such as
# //www.bilibili.com/video/BV1Kz421i71x/ ; main() rewrites them before
# handing them to fetch_bullet().
if __name__ == "__main__":
    main()
|