"""Collect Bilibili danmaku for "2024 Paris Olympics" search results.

Pipeline:

1. Scrape the search result pages and extract the BV ids of the videos.
2. Resolve every BV id to the cid of its first part.
3. Download the danmaku segment for each cid and pull out the comment text.
4. Write the comments to ``total_data.xls`` and ``danmu.txt`` and plot how
   often selected keywords occur.

The login cookie is read from the ``BILIBILI_COOKIE`` environment variable so
that session credentials are never stored in the source tree.
"""
import json
import os
import re
import time

import matplotlib.pyplot as plt
import openpyxl
import pandas as pd
import requests
from openpyxl import Workbook

# Matplotlib font configuration.
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei typeface
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

SEARCH_API_URL = 'https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.788&search_source=2'
PAGELIST_API_URL = 'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
DANMU_API_URL = 'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid={cid}&date=2024-09-06'

USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) '
              'AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50')
REFERER = 'https://www.bilibili.com/video'
COOKIE_ENV_VAR = 'BILIBILI_COOKIE'

LAST_SEARCH_PAGE = 11       # search pages 1..11 are scraped
RESULTS_PER_PAGE = 30       # offset step used for the ``o`` query parameter
REQUEST_TIMEOUT = 10        # seconds; keeps a stalled connection from hanging the run
REQUEST_DELAY_SECONDS = 1   # pause between danmaku requests

OUTPUT_WORKBOOK = 'total_data.xls'
OUTPUT_TEXT = 'danmu.txt'

BV_PATTERN = re.compile(r'video/(BV\w+)')
DANMU_TEXT_PATTERN = re.compile(':([^@]*)@')

# Control characters that are replaced by '.' before a comment is stored:
# every code point below 0x20 except tab, line feed and carriage return,
# plus DEL (0x7f).
_ILLEGAL_CHARS = ''.join(
    chr(code) for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
) + '\x7f'
_ILLEGAL_CHAR_TABLE = str.maketrans(_ILLEGAL_CHARS, '.' * len(_ILLEGAL_CHARS))


def count_keywords(danmu_data, keywords):
    """Count how often each keyword occurs across all danmaku strings.

    Args:
        danmu_data: iterable of comment strings.
        keywords: iterable of keyword strings.

    Returns:
        A dict mapping every keyword to its total number of non-overlapping
        occurrences (``str.count``) summed over ``danmu_data``.
    """
    keywords = list(keywords)
    keyword_counts = {keyword: 0 for keyword in keywords}
    for danmu in danmu_data:
        for keyword in keywords:
            keyword_counts[keyword] += danmu.count(keyword)
    return keyword_counts


# Data visualisation.
def visualize_data(keyword_counts):
    """Show a bar chart of the keyword counts (blocks until the window closes)."""
    plt.bar(list(keyword_counts.keys()), list(keyword_counts.values()))
    plt.xlabel('Keywords')
    plt.ylabel('Counts')
    plt.title('Keyword Counts in Danmu Data')
    plt.show()


def clean_header(header):
    """Return ``header`` with every non-ASCII character removed."""
    return ''.join(c for c in header if ord(c) < 128)


def build_headers():
    """Build the HTTP headers shared by every request.

    The cookie comes from the ``BILIBILI_COOKIE`` environment variable; when
    it is missing the Cookie header is omitted and a warning is printed.
    """
    headers = {'User-Agent': USER_AGENT, 'Referer': REFERER}
    cookie = os.environ.get(COOKIE_ENV_VAR, '').strip()
    if cookie:
        # Header values must be encodable, so drop any non-ASCII characters.
        headers['Cookie'] = clean_header(cookie)
    else:
        print(f"未设置环境变量 {COOKIE_ENV_VAR}, 历史弹幕接口可能拒绝请求")
    return headers


def extract_bv_ids(html):
    """Return every BV id referenced by a ``video/BV...`` link in ``html``."""
    return BV_PATTERN.findall(html)


def search_page_urls():
    """Yield the URL of every search result page, first page first."""
    yield SEARCH_API_URL
    for page in range(2, LAST_SEARCH_PAGE + 1):
        offset = RESULTS_PER_PAGE * (page - 1)
        # The page number is its own query parameter; appending it directly
        # to the base URL would corrupt the trailing ``search_source`` value.
        yield f'{SEARCH_API_URL}&page={page}&o={offset}'


def collect_bv_ids(session):
    """Scrape all search pages and return the distinct BV ids in first-seen order."""
    found = []
    for url in search_page_urls():
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"请求错误: {e}")
            continue
        found.extend(extract_bv_ids(response.text))
    # dict.fromkeys removes duplicates while keeping a reproducible order.
    return list(dict.fromkeys(found))


def fetch_cid(session, bvid):
    """Return the cid of the first part of video ``bvid``, or None on failure."""
    url = PAGELIST_API_URL.format(bvid=bvid)
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # fail on HTTP error status codes
        json_dict = json.loads(response.text)
        return json_dict["data"][0]["cid"]
    except requests.RequestException as e:
        print(f"请求错误: {e}")
    except json.JSONDecodeError as e:
        print(f"JSON解码错误: {e}")
    except (KeyError, IndexError, TypeError) as e:
        # Missing key, empty part list, or a null "data" payload.
        print(f"键错误: {e}, 可能是数据结构不符合预期")
    except UnicodeEncodeError as e:
        print(f"编码错误: {e}")
    return None


def fetch_danmu(session, cid):
    """Download the danmaku segment for ``cid`` and return the comment strings.

    The comment text is taken as whatever sits between a ':' and the next
    '@' in the decoded response body. Returns an empty list when the
    request fails.
    """
    url = DANMU_API_URL.format(cid=cid)
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求错误: {e}")
        return []
    return DANMU_TEXT_PATTERN.findall(response.text)


def sanitize_danmu(text):
    """Replace the control characters in ``text`` with '.' in a single pass."""
    return str(text).translate(_ILLEGAL_CHAR_TABLE)


def main():
    """Run the scrape, store the comments and plot the keyword chart."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active  # default worksheet
    sheet['A1'] = 'data'
    key_dataa = []

    with requests.Session() as session:
        session.headers.update(build_headers())

        all_bv_numbers = collect_bv_ids(session)
        print("所有匹配的BV号:", all_bv_numbers)

        print('working.....')
        cidd = []
        for bvid in all_bv_numbers:
            cid = fetch_cid(session, bvid)
            if cid is not None:
                cidd.append(cid)

        print("cid 获取好了!")
        print("接下来是一段较为长时间的数据解析过程,请稍作等待...")

        with open(OUTPUT_TEXT, 'w', encoding='utf-8') as ciyun_file:
            for cid in cidd:
                dataa = fetch_danmu(session, cid)
                # Keyword statistics are computed on the raw comment text.
                key_dataa.extend(dataa)
                for raw in dataa:
                    cleaned = sanitize_danmu(raw)
                    sheet.append([cleaned])
                    # One comment per line so neighbouring comments do not
                    # run together in the text file.
                    ciyun_file.write(cleaned + '\n')
                # NOTE(review): the original indentation of this pause could
                # not be recovered; it is applied once per request here.
                time.sleep(REQUEST_DELAY_SECONDS)

    # Save before plotting: plt.show() blocks, and the scraped data should
    # already be on disk if the chart window is never closed cleanly.
    # NOTE(review): openpyxl writes the xlsx format regardless of the ".xls"
    # extension; the file name is kept because later steps read this path.
    workbook.save(OUTPUT_WORKBOOK)

    # Build the chart.
    keyword_counts = count_keywords(key_dataa, ['全红婵', '潘展乐'])
    visualize_data(keyword_counts)


if __name__ == "__main__":
    main()

    keywords = ['ai','智能','AI','人工']

    # Load the table that was just written.
    df = pd.read_excel('total_data.xls')

    # Check whether each row contains one of the keywords.
    # TODO(review): the source is cut off in the middle of the next
    # statement, so the row-filtering step is not reproduced here:
    # pattern = r'(?