"""Crawl Bilibili danmaku for a fixed search keyword and analyse them.

Pipeline (see the ``__main__`` section at the bottom):

1. ``bvid_get``        - collect BV ids from the search result pages.
2. ``oid_get``         - resolve every BV id to its cid/oid values.
3. ``danmu_get``       - download the danmaku text for every oid.
4. ``create_txt`` / ``create_excel`` - dump the raw danmaku.
5. ``top_eight_danmu`` - write the eight most frequent danmaku.
6. ``search_AI_danmu`` - write the danmaku that contain the keyword "ai".
7. ``create_wordcloud`` - render a word cloud from the text dump.

Authentication: the request cookie is read from the ``BILIBILI_COOKIE``
environment variable. Session cookies are credentials and expire, so they
must not be hard-coded in the source. When the variable is unset, requests
are sent without a cookie.
"""
import html
import os
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pds
import requests
from bs4 import BeautifulSoup  # unused here; kept because other code may rely on it
from openpyxl import Workbook
from PIL import Image  # unused here; kept because other code may rely on it
from wordcloud import WordCloud

# Module-level state shared by the pipeline steps.
bvid_list = []     # BV ids selected for crawling, in discovery order
oid_list = []      # oid/cid values resolved from bvid_list
content_list = []  # danmaku text collected so far
s = set()          # every BV id already seen (used for de-duplication)
count = 300        # maximum number of videos to crawl
pages = 15         # number of search result pages to scan

USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
)
COOKIE_ENV_VAR = 'BILIBILI_COOKIE'
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs forever

SEARCH_URL = (
    'https://search.bilibili.com/video'
    '?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
    '&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={page}'
)
PAGELIST_URL = 'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
DANMU_URL = 'https://api.bilibili.com/x/v1/dm/list.so?oid={oid}'

BVID_RE = re.compile(r'bvid:"(.*?)"')
CID_RE = re.compile(r'"cid":(.*?),')
# NOTE(review): assumes list.so answers with XML holding one
# <d p="...">text</d> element per danmaku - confirm against a live response.
DANMU_RE = re.compile(r'<d p=".*?">(.*?)</d>')


def _build_headers():
    """Return the request headers, adding the cookie only when configured."""
    headers = {'user-agent': USER_AGENT}
    cookie = os.environ.get(COOKIE_ENV_VAR, '').strip()
    if cookie:
        headers['cookie'] = cookie
    return headers


def _fetch_text(url):
    """GET *url* and return the body decoded as UTF-8, or None on failure.

    Network and HTTP errors are printed and swallowed here so that a single
    failed request does not abort the loop of the calling step.
    """
    try:
        response = requests.get(
            url=url, headers=_build_headers(), timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return None
    response.encoding = 'utf-8'
    return response.text


def bvid_get(page, count):
    """Collect up to *count* unique BV ids from search pages 1..*page*.

    New ids are appended to the global ``bvid_list`` (and recorded in ``s``).
    If fewer than *count* ids exist on the scanned pages, every id that was
    found is kept instead of discarding the whole result.

    Args:
        page: number of search result pages to scan (inclusive).
        count: maximum number of BV ids to keep in ``bvid_list``.
    """
    for page_no in range(1, page + 1):
        if len(bvid_list) >= count:
            break
        text = _fetch_text(SEARCH_URL.format(page=page_no))
        if text is None:
            continue
        for bvid in BVID_RE.findall(text):
            if bvid in s:
                continue
            s.add(bvid)
            bvid_list.append(bvid)
            if len(bvid_list) >= count:
                break
    print("爬取", len(bvid_list), "个视频的bvid")


def oid_get():
    """Resolve every BV id in ``bvid_list`` and extend ``oid_list``.

    Every ``"cid"`` value found in the pagelist response is kept, so one
    BV id can contribute several entries. The values are stored as the
    strings matched by the regular expression.
    """
    for bvid in bvid_list:
        text = _fetch_text(PAGELIST_URL.format(bvid=bvid))
        if text is None:
            continue
        oid_list.extend(CID_RE.findall(text))


def danmu_get():
    """Download the danmaku of every oid in ``oid_list`` into ``content_list``."""
    for oid in oid_list:
        text = _fetch_text(DANMU_URL.format(oid=oid))
        if text is not None:
            # The payload is XML, so entities such as &amp; must be decoded.
            content_list.extend(
                html.unescape(danmu) for danmu in DANMU_RE.findall(text)
            )
        # Pause between requests to keep the request rate low.
        time.sleep(0.5)
    print("弹幕数量", len(content_list))


def create_txt():
    """Write ``content_list`` to ``弹幕567.txt``, one danmaku per line.

    The file is opened once and overwritten, matching the Excel outputs, so
    repeated runs do not pile up duplicated lines that would skew the word
    cloud.
    """
    with open('弹幕567.txt', mode='w', encoding='utf-8') as f:
        f.writelines(f'{content}\n' for content in content_list)


def create_excel():
    """Write ``content_list`` to ``./弹幕567.xlsx`` in a single ``弹幕`` column."""
    frame = pds.DataFrame({'弹幕': content_list})
    frame.to_excel("./弹幕567.xlsx", index=False)


def top_eight_danmu():
    """Write the eight most frequent danmaku to ``top_eight_danmu567.xlsx``."""
    top_eight = Counter(content_list).most_common(8)

    wb = Workbook()
    ws = wb.active
    ws.append(["Item", "Count"])
    # "occurrences" instead of "count" to avoid shadowing the module global.
    for item, occurrences in top_eight:
        ws.append([item, occurrences])
    wb.save("top_eight_danmu567.xlsx")


def search_AI_danmu():
    """Count the danmaku containing "ai" and write them to ``AI相关弹幕.xlsx``.

    Reads ``弹幕567.xlsx`` (sheet ``Sheet1``, column ``弹幕``). The match is a
    case-insensitive substring test, so any text that merely contains the
    letters "ai" is included.
    """
    file_path = '弹幕567.xlsx'
    sheet_name = 'Sheet1'
    keyword = 'ai'
    output_file_path = 'AI相关弹幕.xlsx'

    df = pds.read_excel(file_path, sheet_name=sheet_name)

    # Purely numeric danmaku are read back as numbers; casting to str keeps
    # the .str accessor usable whatever dtype the column was read as.
    danmu = df['弹幕'].dropna().astype(str)
    matches = danmu[danmu.str.contains(keyword, na=False, case=False)]

    # Occurrences per distinct text (duplicates are counted, not removed).
    content_counts_df = matches.value_counts().reset_index()
    content_counts_df.columns = ['Text', 'Total_Count']
    content_counts_df.to_excel(output_file_path, index=False)

    print(f"包含关键词'{keyword}'的内容及其总数量已写入到'{output_file_path}'")


def create_wordcloud():
    """Render a word cloud from ``弹幕567.txt`` and save it as ``词云567.png``.

    The text is segmented with jieba; tokens of length one and the stop
    words below are dropped. Nothing is rendered when no token survives the
    filtering.
    """
    # Stop words. The single-character entries are already removed by the
    # length filter below; they are kept to document the intent.
    exclude_words = {
        '的', '了', '啊', '在', '是', '我', '你', '他', '她', '它',
        '有', '和', '这', '那', '这个', '不是', '真的', '我们', '你们', '他们',
    }

    with open('弹幕567.txt', encoding='utf-8') as f:
        txt = f.read()

    filtered_words = [
        word for word in jieba.lcut(txt)
        if len(word) > 1 and word not in exclude_words
    ]
    string = ' '.join(filtered_words)
    if not string.strip():
        # Generating a word cloud from empty text would raise; skip instead.
        print("没有可用于生成词云的弹幕")
        return

    wc = WordCloud(
        width=1000,
        height=700,
        background_color='white',
        colormap='viridis',
        # NOTE(review): presumably the Microsoft YaHei font shipped with
        # Windows; a CJK-capable font is needed - confirm on other systems.
        font_path='msyh.ttc',
    )
    wc.generate(string)

    # Save before showing: plt.show() blocks until the window is closed.
    wc.to_file('词云567.png')

    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()


if __name__ == '__main__':
    bvid_get(pages, count)
    oid_get()
    danmu_get()
    create_txt()
    create_excel()
    top_eight_danmu()
    search_AI_danmu()
    create_wordcloud()