From 58a8049fd39ff5cfad5d0077e936db8918e91d3d Mon Sep 17 00:00:00 2001
From: QMZ <1164250597@qq.com>
Date: Mon, 16 Sep 2024 16:34:18 +0800
Subject: [PATCH] demo 4.0: implement the basic features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 demo.py | 49 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/demo.py b/demo.py
index ece161a..ec97928 100644
--- a/demo.py
+++ b/demo.py
@@ -3,10 +3,12 @@ from bs4 import BeautifulSoup
 import re
 import time
 import random
-import jieba # 结巴分词 pip install jieba
-import wordcloud # 词云图 pip install wordcloud
-import imageio # 读取本地图片 修改词云图形
+import jieba
+import wordcloud
 import matplotlib.pyplot as plt
+import pandas as pd
+from pandas import ExcelWriter
+from collections import Counter
 
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
@@ -16,10 +18,6 @@ keywords = [
     '算法', '数据科学', '智能算法', '自然语言处理', '计算机视觉',
     '智能机器人', '智能系统', '人工智能技术', 'AI技术', 'AI应用',
     '智能设备', '智能分析', 'AI模型', '大数据', '预测分析',
-    '模式识别', '语音识别', '图像识别', '机器人技术', '数据挖掘',
-    '智能决策', '虚拟助手', '增强现实', '计算智能', '自适应系统',
-    '智能网络', '知识图谱', '智能交互', 'AI解决方案', '计算机智能',
-    '自然语言生成', '深度神经网络', '强化学习', '迁移学习', '生成对抗网络',
     '智能预测', '智慧城市', '智能制造', '机器视觉', '自动驾驶',
     '智能传感器', '智能控制', '智能推荐', '计算机科学', '人工智能应用',
     '人工智能发展', 'AI伦理', '人工智能安全', '智能算法应用', '数据分析',
@@ -37,7 +35,6 @@ def get_search_page(search_url):
 def extract_video_links(page_content):
     soup = BeautifulSoup(page_content, 'html.parser')
     video_links = []
-    # 选择器根据实际网页结构可能需要调整
     for a_tag in soup.select(".video-list.row div.bili-video-card > div > a"):
         link = a_tag.get('href')
         video_links.append(link)
@@ -53,6 +50,7 @@ def extract__BV(video_urls):
             links.append(video_id_match.group(1))
     return links
 
+# Convert each video's BV ID to its CID
 def get_cid_from_bv(bv_ids):
     cids=[]
     for bv_id in bv_ids:
@@ -68,7 +66,8 @@ def get_cid_from_bv(bv_ids):
             cid = data.get('data', {}).get('cid')
             cids.append(cid)
     return cids
-    
+
+# Fetch the danmu (bullet comments) of one video
 def get_danmu(id):
     global videosnumber
     video_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={id}'
@@ -86,11 +85,13 @@ def get_danmu(id):
     time.sleep(random.randint(0,2)+random.random())
     return(txtsss)   ###打印便可看见一条条弹幕的属性和内容了。
 
+# Build the URL of the next page of search results
 def page(url,num):
     num=num+1
     url=f'https://search.bilibili.com/video?keyword=2024巴黎奥运会&page={num}'
     return url
 
+# Process the danmu: cut them into words and draw a word cloud
 def chuli(alltxt):
     danmustr=''.join(i for i in alltxt)    #将所有弹幕拼接在一起
     words=list(jieba.cut(danmustr))    ###利用jieba库将弹幕按词进行切分
@@ -101,11 +102,25 @@ def chuli(alltxt):
     plt.imshow(wc)
     plt.show()
 
+# Count the danmu that contain at least one of the AI keywords
+def sort(txt, keywords):
+    comment_counter = Counter()
+    for line in txt:
+        if any(word in keywords for word in jieba.cut(line)):
+            comment_counter[line] += 1
+    return comment_counter
+
+# Save the danmu to an Excel file
+def save_to_excel(danmu_data, filename='danmu_data.xlsx'):
+    # build a single-column DataFrame
+    df = pd.DataFrame(danmu_data, columns=['弹幕'])
+    # write it out (needs the openpyxl engine)
+    with ExcelWriter(filename, engine='openpyxl') as writer:
+        df.to_excel(writer, index=False)
 
 # 主函数
 def main(kword,mubiao):
     search_url= f'https://search.bilibili.com/video?keyword={kword}'
-    print(search_url)
     for i in range(100):
         search_url=page(search_url,i)
         page_content = get_search_page(search_url)
@@ -122,9 +137,17 @@ def main(kword,mubiao):
     return(alltxt)
 
 # 示例搜索页 URL(需要替换为实际的搜索页 URL)
-keword = "2024巴黎奥运会"
-flag = 5 #你要爬的视频数量
+keword = "2024巴黎奥运会"  # search keyword
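+# main() pages through the search results (up to 100 pages) until this many videos have been scraped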
+flag = 300  # how many videos you want to crawl
+
 alltxt=main(keword,flag)
 chuli(alltxt)
 
-
+save_to_excel(alltxt)
+
+# print the eight keyword-matched danmu that appear most often
+comment_counter = sort(alltxt, keywords)
+top_comments = comment_counter.most_common(8)
+for comment, count in top_comments:
+    print(f'Danmu: {comment}, count: {count}')
\ No newline at end of file
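
A minimal standalone sketch of the filter-and-count step this patch adds (the
`sort()` helper): `count_keyword_danmu` and the sample data below are
illustrative only, not part of demo.py.

    from collections import Counter
    import jieba  # same segmenter demo.py uses

    # a small subset of the script's AI keyword list
    keywords = ['人工智能', 'AI技术', '机器学习']

    def count_keyword_danmu(comments, keywords):
        # keep a comment only if one of its jieba tokens is a keyword
        counter = Counter()
        for line in comments:
            if any(word in keywords for word in jieba.cut(line)):
                counter[line] += 1
        return counter

    sample = ['人工智能越来越强了', '今天天气不错', '人工智能越来越强了']
    for comment, count in count_keyword_danmu(sample, keywords).most_common(8):
        print(f'Danmu: {comment}, count: {count}')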