应该是完整的最终版本

main
QMZ 2 months ago
parent d77fedd44e
commit 96f586c49a

@ -2,8 +2,8 @@ import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re import re
import time import time
import random
import jieba import jieba
import random
import wordcloud import wordcloud
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
@ -73,13 +73,12 @@ def get_danmu(id):
html = response.text html = response.text
soup = BeautifulSoup(html) #使用beautifulsoup库快速查找我们想要的信息 soup = BeautifulSoup(html) #使用beautifulsoup库快速查找我们想要的信息
all_txt = soup.findAll("d") #寻找到所有包含d的行 all_txt = soup.findAll("d") #寻找到所有包含d的行
txt=[all_txts.attrs ["p"]for all_txts in all_txt] #寻找到所有包含d的行中属性为p的值这里边包含了弹幕的虚拟id等
txtss=[all_txts.string for all_txts in all_txt] #寻找到所有包含d的行中的字符串数据即弹幕内容 txtss=[all_txts.string for all_txts in all_txt] #寻找到所有包含d的行中的字符串数据即弹幕内容
txtsss=[txts.replace(' ','') for txts in txtss] #将字符串中的空格消除掉 txtsss=[txts.replace(' ','') for txts in txtss] #将字符串中的空格消除掉
videosnumber = videosnumber +1 videosnumber = videosnumber +1
bulletnumber = len(txtsss) bulletnumber = len(txtsss)
print( f"这是第{videosnumber}视频, 获取到{bulletnumber}弹幕") print( f"这是第{videosnumber}视频, 获取到{bulletnumber}弹幕")
time.sleep(random.randint(0,2)+random.random()) time.sleep(random.randint(0,1))
return(txtsss) ###打印便可看见一条条弹幕的属性和内容了。 return(txtsss) ###打印便可看见一条条弹幕的属性和内容了。
#翻页 #翻页
@ -88,10 +87,10 @@ def page(url,num):
url=f'https://search.bilibili.com/video?keyword=2024巴黎奥运会&page={num}' url=f'https://search.bilibili.com/video?keyword=2024巴黎奥运会&page={num}'
return url return url
#处理弹幕 #词云图
def wcloud(alltxt): def wcloud(alltxt):
danmustr=''.join(i for i in alltxt) #将所有弹幕拼接在一起 danmustr=''.join(i for i in alltxt) #将所有弹幕拼接在一起
words=list(jieba.cut(danmustr)) ###利用jieba库将弹幕按词进行切分 words=list(jieba.cut(danmustr)) #利用jieba库将弹幕按词进行切分
words=[i for i in words if len(i)>1] ###挑出长度大于1的词语为去除诸如啊等字符 words=[i for i in words if len(i)>1] ###挑出长度大于1的词语为去除诸如啊等字符
wc=wordcloud.WordCloud(height=1000,width=1000,font_path='simsun.ttc')#利用wordcloud库定义词云图片的信息 wc=wordcloud.WordCloud(height=1000,width=1000,font_path='simsun.ttc')#利用wordcloud库定义词云图片的信息
wc.generate(' '.join(words)) ##生成图片 wc.generate(' '.join(words)) ##生成图片
@ -99,6 +98,7 @@ def wcloud(alltxt):
plt.imshow(wc) plt.imshow(wc)
plt.show() plt.show()
#提取AI相关弹幕
def sort(txt, keywords): def sort(txt, keywords):
comment_counter = Counter() comment_counter = Counter()
@ -119,13 +119,13 @@ def save_to_excel(danmu_data, filename='danmu_data.xlsx'):
# 主函数 # 主函数
def main(kword,mubiao): def main(kword,mubiao):
alltxt=[]
search_url= f'https://search.bilibili.com/video?keyword={kword}' search_url= f'https://search.bilibili.com/video?keyword={kword}'
for i in range(100): for i in range(100):
search_url=page(search_url,i) search_url=page(search_url,i)
page_content = get_search_page(search_url) page_content = get_search_page(search_url)
video_links = extract_video_links(page_content) video_links = extract_video_links(page_content)
bvs = extract__BV(video_links) bvs = extract__BV(video_links)
alltxt=[]
cids = [] cids = []
cids = get_cid_from_bv(bvs) cids = get_cid_from_bv(bvs)
for id in cids: for id in cids:
@ -137,7 +137,7 @@ def main(kword,mubiao):
# 示例搜索页 URL需要替换为实际的搜索页 URL # 示例搜索页 URL需要替换为实际的搜索页 URL
keword = "2024巴黎奥运会" #视频关键词 keword = "2024巴黎奥运会" #视频关键词
flag = 10 #你要爬的视频数量 flag = 300 #你要爬的视频数量
alltxt=main(keword,flag) alltxt=main(keword,flag)
wcloud(alltxt) wcloud(alltxt)

Loading…
Cancel
Save