diff --git a/danmupapa.py b/danmupapa.py deleted file mode 100644 index cc17e1b..0000000 --- a/danmupapa.py +++ /dev/null @@ -1,219 +0,0 @@ -import requests -import re -import pandas as pd -import json -import time -import openpyxl -from openpyxl import Workbook -import matplotlib.pyplot as plt - -# 设置Matplotlib的字体 -plt.rcParams['font.sans-serif'] = ['SimHei'] # 使用黑体 - - -def count_keywords(danmu_data, keywords): - keyword_counts = {keyword: 0 for keyword in keywords} - for danmu in danmu_data: - for keyword in keywords: - keyword_counts[keyword] += danmu.count(keyword) - return keyword_counts - -# 数据可视化 -def visualize_data(keyword_counts): - plt.bar(keyword_counts.keys(), keyword_counts.values()) - plt.xlabel('Keywords') - plt.ylabel('Counts') - plt.title('Keyword Counts in Danmu Data') - plt.show() - - - -SEARCH_API_URL ='https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.788&search_source=2' - -headers = {'User-Agent': '"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', - 'Cookie': 'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bp_t_offset_512104208=975890211160457216; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYzMjExMjksImlhdCI6MTcyNjA2MTg2OSwicGx0IjotMX0.wd7VeJfGfUQR0Ofh0z5fnmQ1CpNKEjECYT5nAJWTgEQ; bili_ticket_expires=1726321069; bp_t_offset_3546760311998987=976235500761251840; b_lsid=81077410AF_191E6741DE6; bsource=search_baidu; home_feed_column=5; browser_resolution=2100-1095; SESSDATA=7e54226a%2C1741705429%2C85264%2A92CjC-dRoO18jeyJJLwwGvwJs00QhNm1jjKoflnsViX87MSQJOgo3T2MRFIWOTYe5GVuASVnd4YVRLa0NKN2NDWTdBVFZWb2hYR0M2RXBoTnRyeTliNnptVlFJWWhYWnZtam9pNmpNcm54Q00ybzNUenVxSng5UC1hY2VyYUQ5Y0U3czQ3NHRGOEFBIIEC; bili_jct=87feb43d0b6b672ca3d06eca43c442b1; DedeUserID=512104208; DedeUserID__ckMd5=d02f77dab719b614; sid=q6cirm22', - 'Origin': 'https: // search.bilibili.com', - 'Referer': 'https://www.bilibili.com/video' - - } - -response = requests.get(url=SEARCH_API_URL, headers=headers) -##print(response.text) -hrefs = [ -response.text - -] - -# 定义正则表达式模式 -pattern = r'video/(BV\w+)' - -# 提取所有匹配项 -all_bv_numbers = [] -for href in hrefs: - matches = re.findall(pattern, href) - all_bv_numbers.extend(matches) - #all_bv_numbers=set(all_bv_numbers) -#print("所有匹配的BV号:", all_bv_numbers) -for a in range(2,12): - b=30*a-30 - SEARCH_API_URL = 'https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.788&search_source=2'+str(a)+'&o='+str(b) - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.97 Safari/537.36 Core/1.116.438.400 QQBrowser/13.0.6070.400', - 'Cookie': 'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bp_t_offset_512104208=975890211160457216; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYzMjExMjksImlhdCI6MTcyNjA2MTg2OSwicGx0IjotMX0.wd7VeJfGfUQR0Ofh0z5fnmQ1CpNKEjECYT5nAJWTgEQ; bili_ticket_expires=1726321069; bp_t_offset_3546760311998987=976235500761251840; b_lsid=81077410AF_191E6741DE6; bsource=search_baidu; home_feed_column=5; browser_resolution=2100-1095; SESSDATA=7e54226a%2C1741705429%2C85264%2A92CjC-dRoO18jeyJJLwwGvwJs00QhNm1jjKoflnsViX87MSQJOgo3T2MRFIWOTYe5GVuASVnd4YVRLa0NKN2NDWTdBVFZWb2hYR0M2RXBoTnRyeTliNnptVlFJWWhYWnZtam9pNmpNcm54Q00ybzNUenVxSng5UC1hY2VyYUQ5Y0U3czQ3NHRGOEFBIIEC; bili_jct=87feb43d0b6b672ca3d06eca43c442b1; DedeUserID=512104208; DedeUserID__ckMd5=d02f77dab719b614; sid=q6cirm22', - - } - response = requests.get(url=SEARCH_API_URL, headers=headers) - ##print(response.text) - hrefs = [ - response.text - - ] - - # 定义正则表达式模式 - pattern = r'video/(BV\w+)' - - # 提取所有匹配项 - - for href in hrefs: - matches = re.findall(pattern, href) - all_bv_numbers.extend(matches) - - - - a+=1 -all_bv_numbers = set(all_bv_numbers) -print("所有匹配的BV号:", all_bv_numbers) - -url_bag = [] - -# 根据oid请求弹幕,解析弹幕得到最终的数据\ -for bvv in all_bv_numbers: - - - url_bag.append(str('https://api.bilibili.com/x/player/pagelist?bvid='+str(bvv)+'&jsonp=jsonp')) - - - # 清理请求头中的非 ASCII 字符 -def clean_header(header): - return ''.join([c if ord(c) < 128 else '' for c in header]) -cidd=[] -print('working.....') -for url in url_bag: - - headers={ - 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', - 'Referer': 'https://www.bilibili.com/video', - 'Cookie': 'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYzMjExMjksImlhdCI6MTcyNjA2MTg2OSwicGx0IjotMX0.wd7VeJfGfUQR0Ofh0z5fnmQ1CpNKEjECYT5nAJWTgEQ; bili_ticket_expires=1726321069; bsource=search_baidu; home_feed_column=5; browser_resolution=2100-1095; b_lsid=2C2F5DE8_191E6C6B17F; bp_t_offset_512104208=976311809445199872; SESSDATA=64f9af2c%2C1741705746%2C252bc%2A92CjBco6PCEf5jHGOtLwBCnpRRhnXcR0SL850C4F5X8GK2eVczaFKXrWQv4b7zWBkS77cSVmtEdktONDd2RUx0NTQybzAwZTdlOU1jNGlvbnZuN0YzU3VOZFBPZVBWMEVqWFN5a3hBdlVYTWkwaXZQTndCR3FQUkVIT1N1elpCNUI5UFZPc2JTYzNnIIEC; bili_jct=0dbb281cc47245a2970662a9f04112ea; DedeUserID=3546760311998987; DedeUserID__ckMd5=945a443ae2d0a983; sid=8uthyfdc; bp_t_offset_3546760311998987=976312754338004992', - } - - try: - response = requests.get(url=url, headers=headers) - response.raise_for_status() # 检查请求是否成功 - # 打印响应文本 - json_dict = json.loads(response.text) - cid = json_dict["data"][0]["cid"] - - cidd.append(cid) - except requests.RequestException as e: - print(f"请求错误: {e}") - except json.JSONDecodeError as e: - print(f"JSON解码错误: {e}") - except KeyError as e: - print(f"键错误: {e}, 可能是数据结构不符合预期") - except UnicodeEncodeError as e: - print(f"编码错误: {e}") - time.sleep(2) - - # 增加请求间隔 - -print("cid 获取好了!") -print("接下来是一段较为长时间的数据解析过程,请稍作等待...") - -cnt=0 -result = open('total_data.xls', 'w', encoding='utf-8') -ciyun_file=open('danmu.txt','w',encoding='utf-8') -workbook = openpyxl.Workbook() - -# 获取默认的工作表 -sheet = workbook.active - -# 写入数据 -sheet['A1'] = 'data' -key_dataa=[] -for data in cidd: - url = 'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid='+str(data)+'&date=2024-09-06' - - headers = {'User-Agent': '"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', - 'Referer': 'https://www.bilibili.com/video', - 'Cookie': 'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYzMjExMjksImlhdCI6MTcyNjA2MTg2OSwicGx0IjotMX0.wd7VeJfGfUQR0Ofh0z5fnmQ1CpNKEjECYT5nAJWTgEQ; bili_ticket_expires=1726321069; bp_t_offset_3546760311998987=976235500761251840; bsource=search_baidu; home_feed_column=5; browser_resolution=2100-1095; b_lsid=2C2F5DE8_191E6C6B17F; bp_t_offset_512104208=976311809445199872; SESSDATA=64f9af2c%2C1741705746%2C252bc%2A92CjBco6PCEf5jHGOtLwBCnpRRhnXcR0SL850C4F5X8GK2eVczaFKXrWQv4b7zWBkS77cSVmtEdktONDd2RUx0NTQybzAwZTdlOU1jNGlvbnZuN0YzU3VOZFBPZVBWMEVqWFN5a3hBdlVYTWkwaXZQTndCR3FQUkVIT1N1elpCNUI5UFZPc2JTYzNnIIEC; bili_jct=0dbb281cc47245a2970662a9f04112ea; DedeUserID=3546760311998987; DedeUserID__ckMd5=945a443ae2d0a983; sid=8uthyfdc', - - } - response = requests.get(url=url, headers=headers) - - mama=response.text - - - pattern1 = ':([^@]*)@' - - dataa = re.findall(pattern1, response.text) - keyword = '你' - - # 提取含有关键词的元素 - filtered_elements = [] - key_dataa=key_dataa+dataa - for a in dataa: - #去除违法字符 - illegal_chars = ['\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', - '\x08', '\x0b', '\x0c', '\x0e', '\x0f', '\x10', '\x11', '\x12', - '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', - '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', '\x7f'] - for char in illegal_chars: - a = a.replace(char, '.') - - - a = str(a) - sheet.append([a]) - ciyun_file.write(a) -#完成图表 -keywords = ['全红婵', '潘展乐'] -keyword_counts = count_keywords(key_dataa, keywords) - -visualize_data(keyword_counts) -workbook.save('total_data.xls') -keywords = ['ai','智能','AI','人工'] - -# 读取原始表格数据 -df = pd.read_excel('total_data.xls') - -# 检查每一行是否含有关键字 -pattern = r'(?