diff --git a/demo.py b/demo.py index cbc42dc..f94597b 100644 --- a/demo.py +++ b/demo.py @@ -2,8 +2,8 @@ import requests from bs4 import BeautifulSoup import re import time -import random -import jieba +import jieba +import random import wordcloud import matplotlib.pyplot as plt import pandas as pd @@ -73,13 +73,12 @@ def get_danmu(id): html = response.text soup = BeautifulSoup(html) #使用beautifulsoup库快速查找我们想要的信息 all_txt = soup.findAll("d") #寻找到所有包含d的行 - txt=[all_txts.attrs ["p"]for all_txts in all_txt] #寻找到所有包含d的行中属性为p的值,这里边包含了弹幕的虚拟id等 txtss=[all_txts.string for all_txts in all_txt] #寻找到所有包含d的行中的字符串数据,即弹幕内容 txtsss=[txts.replace(' ','') for txts in txtss] #将字符串中的空格消除掉 videosnumber = videosnumber +1 bulletnumber = len(txtsss) print( f"这是第{videosnumber}视频, 获取到{bulletnumber}弹幕") - time.sleep(random.randint(0,2)+random.random()) + time.sleep(random.randint(0,1)) return(txtsss) ###打印便可看见一条条弹幕的属性和内容了。 #翻页 @@ -88,10 +87,10 @@ def page(url,num): url=f'https://search.bilibili.com/video?keyword=2024巴黎奥运会&page={num}' return url -#处理弹幕 +#词云图 def wcloud(alltxt): danmustr=''.join(i for i in alltxt) #将所有弹幕拼接在一起 - words=list(jieba.cut(danmustr)) ###利用jieba库将弹幕按词进行切分 + words=list(jieba.cut(danmustr)) #利用jieba库将弹幕按词进行切分 words=[i for i in words if len(i)>1] ###挑出长度大于1的词语(为去除诸如?,哈,啊等字符) wc=wordcloud.WordCloud(height=1000,width=1000,font_path='simsun.ttc')#利用wordcloud库定义词云图片的信息 wc.generate(' '.join(words)) ##生成图片 @@ -99,6 +98,7 @@ def wcloud(alltxt): plt.imshow(wc) plt.show() +#提取AI相关弹幕 def sort(txt, keywords): comment_counter = Counter() @@ -119,13 +119,13 @@ def save_to_excel(danmu_data, filename='danmu_data.xlsx'): # 主函数 def main(kword,mubiao): + alltxt=[] search_url= f'https://search.bilibili.com/video?keyword={kword}' for i in range(100): search_url=page(search_url,i) page_content = get_search_page(search_url) video_links = extract_video_links(page_content) bvs = extract__BV(video_links) - alltxt=[] cids = [] cids = get_cid_from_bv(bvs) for id in cids: @@ -137,7 +137,7 @@ def main(kword,mubiao): # 示例搜索页 URL(需要替换为实际的搜索页 URL) keword = "2024巴黎奥运会" #视频关键词 -flag = 10 #你要爬的视频数量 +flag = 300 #你要爬的视频数量 alltxt=main(keword,flag) wcloud(alltxt)