From 146555172efe5f2488118258e7ecbc156d05f61a Mon Sep 17 00:00:00 2001 From: p7zwxau5j <904679083@qq.com> Date: Mon, 16 Sep 2024 00:22:14 +0800 Subject: [PATCH] =?UTF-8?q?Delete=20'=E5=BC=B9=E5=B9=95-=E7=88=AC=E8=99=AB?= =?UTF-8?q?-=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86-=E8=AF=8D=E4=BA=91?= =?UTF-8?q?=E5=9B=BE.py'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 弹幕-爬虫-数据处理-词云图.py | 200 ------------------------ 1 file changed, 200 deletions(-) delete mode 100644 弹幕-爬虫-数据处理-词云图.py diff --git a/弹幕-爬虫-数据处理-词云图.py b/弹幕-爬虫-数据处理-词云图.py deleted file mode 100644 index 923037f..0000000 --- a/弹幕-爬虫-数据处理-词云图.py +++ /dev/null @@ -1,200 +0,0 @@ -import requests -import re -import pandas as pds -import time -import jieba -import matplotlib.pyplot as plt -from wordcloud import WordCloud -from PIL import Image -from bs4 import BeautifulSoup -from collections import Counter -from openpyxl import Workbook - -# 定义全局变量 -bvid_list = [] # 存储BV号的列表 -oid_list = [] # 存储OID号的列表 -content_list = [] # 存储弹幕内容的列表 -s = set([]) # 存储不重复的BV号集合 -count = 300 # 爬取视频数量 -pages = 15 # 爬取网页页数 - - -#爬取bv号 -def bvid_get(page,count): - try: - for i in range(1,page): # 爬取指定页数 - url = f'https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={i}' - headers = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f" - } - response = requests.get(url=url, headers=headers) - response.encoding = 'utf-8' - #print(response.text) - temp=re.findall(r'bvid:"(.*?)"',response.text) - #print(temp) - for i in temp: - s.add(i) - if len(bvid_list)>=count: - break - if (len(s)>=count): - bvid_list.extend(list(s)) - break - #print(bvid_list) - print("爬取",len(bvid_list),"个视频的bvid") - except requests.RequestException as e: - print(e) - - - -#爬取视频oid号 -def oid_get(): - try: - for i in bvid_list: - url = f'https://api.bilibili.com/x/player/pagelist?bvid={i}&jsonp=jsonp' - headers = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f" - } - response = requests.get(url=url, headers=headers) - response.encoding = 'utf-8' - #print(response.text) - oid_list.extend(re.findall(r'"cid":(.*?),',response.text)) - #print(oid_list) - #print('爬取',len(oid_list),'个视频的oid') - except requests.RequestException as e: - print(e) - - - -#爬取弹幕 -def danmu_get(): - try: - for i in oid_list: - #print(i) - url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={i}' - headers = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - "cookie": "buvid4=B85AEC97-F69C-C637-E3C2-C663D774232B87096-022081317-Mk4wjKcJQ45KiYrEq7vrds65fs3pUAqkDfUodWgIWmefZ%2B8Gkn9QRg%3D%3D; CURRENT_FNVAL=4048; buvid_fp=ae1a9ed67ce4b4b81601b1428b3217f9; rpdid=|(k|lk~umYk)0J'u~um~Y|lku; SESSDATA=19c9c747%2C1735394649%2Cde0b5%2A72CjCGWwVQmSRSWbmZZg7l2IVicrrImP6KbqADlDdybFU35A2pNwxPiQvGuSWVPFCvg4sSVjNCOVZTU3ZjeXhGcTYwc1RZSWo1U2E2clc2WkJwdE0zREh2SnRnaldDcGhDUTZOVnF2TVdIWVZvVXBuNnJGeWs5MlhvNFdraGZpaGFwbVNvSXlWSERRIIEC; bili_jct=b7b181986a9d1a649b30eb2fa5929d52; DedeUserID=1604567014; DedeUserID__ckMd5=e19ded6239c7cd47; buvid3=3D25DBF7-1426-D2BB-E102-DB9D3B6CE68458040infoc; b_nut=1719933858; enable_web_push=DISABLE; header_theme_version=CLOSE; LIVE_BUVID=AUTO8417216581143780; fingerprint=4a6a977133f772625df973e6fe4b955c; bp_video_offset_1604567014=968870267503771648; hit-dyn-v2=1; CURRENT_QUALITY=80; _uuid=DEA107613-17109-A8AA-9E27-E8BA6FBFC103E89699infoc; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU3OTUyODcsImlhdCI6MTcyNTUzNjAyNywicGx0IjotMX0.4xQbg9vPDx7yeSRK4xst2CzX32eMjPSMdlfFyeUn6_g; bili_ticket_expires=1725795227; PVID=8; bp_t_offset_1604567014=974079272559837184; b_lsid=351B10593_191C81CD95D; home_feed_column=4; browser_resolution=978-740; sid=6zbnnd1f" - } - response = requests.get(url=url, headers=headers) - response.encoding = 'utf-8' - #print(response.text) - for j in re.findall(r'(.*?)',response.text): - content_list.append(j) - #print(content_list) - - time.sleep(0.5) - - print("弹幕数量",len(content_list)) - except requests.RequestException as e: - print(e) - -#生成弹幕txt -def create_txt(): - for content in content_list: - with open('弹幕567.txt', mode='a', encoding='utf-8') as f: - f.write(content) - f.write('\n') - #print(content) - -#生成弹幕excel -def create_excel(): - writeData = { - '弹幕': content_list - } - fwrite = pds.DataFrame(writeData) - fwrite.to_excel("./弹幕567.xlsx", index=False) - -#寻找前8弹幕 -def top_eight_danmu(): - counter = Counter(content_list) - - # 获取频率前八的内容 - top_eight = counter.most_common(8) - # 创建一个新的Excel工作簿和工作表 - wb = Workbook() - ws = wb.active - - # 写入标题行 - ws.append(["Item", "Count"]) - - # 遍历并写入频率前八的项 - for item, count in top_eight: - ws.append([item, count]) - - # 保存工作簿到文件 - wb.save("top_eight_danmu567.xlsx") - - -#查找AI相关弹幕 -def search_AI_danmu(): - file_path = '弹幕567.xlsx' - sheet_name = 'Sheet1' - df = pds.read_excel(file_path, sheet_name=sheet_name) - - # 定义要搜索的关键词 - keyword = 'ai' - - # 筛选出包含关键词的行 - df = pds.read_excel(file_path, sheet_name=sheet_name) - - # 筛选包含关键词的行 - filtered_df = df[df['弹幕'].str.contains(keyword, na=False, case=False)] - - # 计算每种内容的出现次数(不去重,直接计数) - content_counts = filtered_df['弹幕'].value_counts() - - # 将结果转换为DataFrame,并重置索引 - content_counts_df = content_counts.reset_index() - content_counts_df.columns = ['Text', 'Total_Count'] - - # 写入新的Excel文件 - output_file_path = 'AI相关弹幕.xlsx' - content_counts_df.to_excel(output_file_path, index=False) - - print(f"包含关键词'{keyword}'的内容及其总数量已写入到'{output_file_path}'") - - -#词云图 -def create_wordcloud(): - # 需要排除的关键词列表 - exclude_words = set(['的', '了', '啊','在', '是', '我', '你', '他', '她', '它', '有', '和', '这', '那', '这个','不是','真的','我们', '你们', '他们']) - #mask = Image.open(r"D:\b站弹幕爬虫\mask.png").convert("L") - # 读取文件内容 - with open('弹幕567.txt', encoding='utf-8') as f: - txt = f.read() - - # 分词并排除指定关键词 - words = jieba.lcut(txt) - filtered_words = [word for word in words if word not in exclude_words and len(word) > 1] # 排除太短或指定的词 - string = ' '.join(filtered_words) - - # 创建词云对象 - wc = WordCloud( - width=1000, - height=700, - background_color='white', - colormap='viridis', - #mask=mask, - font_path='msyh.ttc' - ) - - # 生成词云 - wc.generate(string) - plt.imshow(wc, interpolation='bilinear') - plt.axis('off') # 不显示坐标轴 - plt.show() - # 保存词云到文件 - wc.to_file('词云567.png') - - -if __name__ == '__main__': - bvid_get(pages,count) - oid_get() - danmu_get() - create_txt() - create_excel() - top_eight_danmu() - search_AI_danmu() - create_wordcloud() \ No newline at end of file