"""Scrape Bilibili danmaku (bullet comments) from the "2024 Paris Olympics"
search results and render them as a word cloud.

Pipeline:
  1. get_outer_urls  -- build the paginated search-result URLs.
  2. get_inter_urls  -- collect the video-page URLs from one result page.
  3. get_data        -- download one video's danmaku, append to a text file.
  4. make_wordcloud  -- segment the collected text with jieba and draw a
                        shaped word cloud.

NOTE(review): this file was recovered from a corrupted diff in which HTML
tag names had been stripped out of the source; the marked sections below
are reconstructions and should be verified against the live site.
"""
import re  # regex extraction of cid / danmaku from page text

import imageio  # read the local mask image that shapes the word cloud
import jieba  # Chinese word segmentation (pip install jieba)
import requests
import wordcloud
from bs4 import BeautifulSoup

# First search-result page; later pages append &page=N&o=offset.
SEARCH_BASE = (
    'https://search.bilibili.com/video?vt=77234042'
    '&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
    '&from_source=webtop_search&spm_id_from=333.1007&search_source=3'
)

# All harvested danmaku are appended to this file.
DANMAKU_FILE = 'D:/弹幕.txt'


def get_outer_urls(n):
    """Return the URLs of the first *n* search-result pages.

    n: number of pages to build (n >= 1; page 1 is always included).
    Returns a list of n URL strings.
    """
    # Page 1 carries no page/offset parameters.
    urllst = [SEARCH_BASE]
    for i in range(2, n + 1):
        # Each page shows 30 videos, so page i starts at offset (i-1)*30.
        urllst.append(f'{SEARCH_BASE}&page={i}&o={i * 30 - 30}')
    return urllst


def get_inter_urls(ui, d_h, d_c):
    """Collect the video-page URLs listed on one search-result page.

    ui:  search-result page URL
    d_h: headers dict (User-Agent) -- pretend to be a real browser
    d_c: cookies dict
    Returns a list of absolute video-page URLs.

    NOTE(review): the extraction loop below is reconstructed -- the
    original loop body was destroyed in the corrupted diff.  Confirm the
    card markup (anchor tags inside div.video-list) against the live page.
    """
    ri = requests.get(ui, headers=d_h, cookies=d_c)
    soupi = BeautifulSoup(ri.text, 'lxml')
    cards = soupi.find('div', class_="video-list row").find_all('div')
    lst = []
    for card in cards:
        anchor = card.find('a')
        if anchor is None or not anchor.get('href'):
            continue
        href = anchor['href']
        # Search results use protocol-relative links ("//www.bilibili.com/...").
        url = 'https:' + href if href.startswith('//') else href
        if url not in lst:  # the nested divs yield duplicates; keep order
            lst.append(url)
    return lst


def get_data(ui, d_h, d_c):
    """Download one video's danmaku and append them to DANMAKU_FILE.

    ui:  video-page URL
    d_h: headers dict
    d_c: cookies dict
    Returns the number of danmaku written (one per line).

    NOTE(review): the cid extraction and the danmaku request were corrupted
    in the diff; they are reconstructed from the surviving
    ``re.findall(...(.*?)...', r2.text)`` residue -- confirm both regexes.
    """
    r1 = requests.get(ui, headers=d_h, cookies=d_c)
    # The video's comment id ("cid") is embedded in the page's inline JSON.
    cid = re.findall(r'"cid":(\d+)', r1.text)[0]
    # The classic danmaku endpoint returns XML: <d p="...">text</d> per entry.
    r2 = requests.get(f'https://comment.bilibili.com/{cid}.xml',
                      headers=d_h, cookies=d_c)
    r2.encoding = 'utf-8'  # endpoint is UTF-8; requests may mis-guess it
    dmlst = re.findall(r'<d p=".*?">(.*?)</d>', r2.text)

    # Fix: open the file once per video instead of once per danmaku
    # (the original re-opened it inside the loop for every line).
    with open(DANMAKU_FILE, 'a', encoding='UTF-8') as f:
        for dm in dmlst:
            f.write(dm + '\n')
    return len(dmlst)


def make_wordcloud():
    """Segment DANMAKU_FILE with jieba and render a shaped word cloud."""
    img = imageio.imread('D:/2.png')  # mask image: white areas stay blank
    with open(DANMAKU_FILE, 'r', encoding='UTF-8') as f:
        text = f.read()
    # jieba splits the Chinese text into words; WordCloud wants a
    # space-separated string.
    text_str = ' '.join(jieba.lcut(text))
    wc = wordcloud.WordCloud(
        width=1000,
        height=800,
        background_color='white',
        mask=img,
        # Raw string: the original non-raw path relied on '\W'/'\F' not
        # being escape sequences.  A Chinese-capable font is required.
        font_path=r'C:\Windows\Fonts\FZSTK.TTF',
        # Common function words to exclude from the cloud.
        stopwords={'了', '的', '是', '我', '啊', '就', '这', '你', '也', '有', '都', '吗'},
    )
    wc.generate(text_str)
    wc.to_file('D:/词云3.png')


def main():
    """Scrape every video on the first result page, then report the count."""
    # Impersonate a desktop browser; without these the site may refuse us.
    dic_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"}
    cookies = "buvid4=DCB17204-EBFC-9664-4E97-3795181B43A418148-022080117-DNbKVr13tEGcPKkOr4lL%2Fg%3D%3D; buvid3=CDE73727-EF10-ED0B-4A16-BA5D759B014208137infoc; b_nut=1698373908; _uuid=16CA10BCD-10538-D8310-8ABA-85D99510C966509403infoc; rpdid=|(u))kkYu|~Y0J'u~uJ|mYlYu; fingerprint=160ca50e14386abcf76b36d0fdd4b02d; buvid_fp_plain=undefined; buvid_fp=160ca50e14386abcf76b36d0fdd4b02d; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; browser_resolution=1455-755; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3Mzc4NjQsImlhdCI6MTcyNjQ3ODYwNCwicGx0IjotMX0.VqGzlkNCgqurTbc0ruEj0IDl8eqJBRRhBq86ARF3Kfk; bili_ticket_expires=1726737804; bp_t_offset_1323959786=977791090671222784; b_lsid=6D45B513_191FE02FA1B; bsource=search_bing; SESSDATA=009ba743%2C1742100653%2C4a273%2A92CjAuo1OGrcH-PW3p0fHH1AKVrtbV3FsE-4oxnyl2hYcGWvzUpi1fSUrIGfYRLBOiSS8SVlhVZmxWSy1QS2ZuWGRpbDRWNjdHZmNKVVpNYjcwOEJ1UjI1R0JJNnZVTUZTczNaUmlPbk1jMlZKQnBFMEJzVmotbV9RcXdfVEVhVFloMzNBd01nYS1RIIEC; bili_jct=4314c0fd203c95101efcce860ce01817; DedeUserID=1323959786; DedeUserID__ckMd5=7e17b1885b370fe5; sid=omn05a3k"
    # Turn the raw Cookie header into the dict requests expects.
    # Fix: split on the FIRST '=' only, so values containing '=' survive.
    dic_cookies = dict(pair.split("=", 1) for pair in cookies.split("; "))

    urllst = get_outer_urls(10)  # build 10 pages of search URLs
    # Only page 1 (up to 30 videos) is actually scraped, as in the original.
    url_inter = get_inter_urls(urllst[0], dic_headers, dic_cookies)

    count = 0
    for u in url_inter:
        # NOTE(review): the original had an `m` counter intended to skip
        # every 5th video, but its increment pattern meant `m == 5` was
        # never reached, so nothing was ever skipped.  The dead logic is
        # removed here; behavior (fetch every video) is unchanged.
        count += get_data(u, dic_headers, dic_cookies)
    print(f'数据采集并存入成功,总共采集{count}条数据')


if __name__ == '__main__':
    # Fix: the word-cloud stage previously ran at module level (even on
    # import); both stages are now guarded behind the entry point.
    main()
    make_wordcloud()