You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

223 lines
11 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import json
import os
import re
import time

import matplotlib.pyplot as plt
import openpyxl
import pandas as pd
import requests
from openpyxl import Workbook
# Configure the Matplotlib fonts in a single rcParams update.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],   # use the SimHei typeface
    'axes.unicode_minus': False,     # render the minus sign correctly
})
def count_keywords(danmu_data, keywords):
    """Count how often each keyword occurs across all danmaku strings.

    Args:
        danmu_data: Iterable of strings to search. It is traversed once.
        keywords: Iterable of keyword strings. Duplicates are collapsed so a
            repeated keyword is not counted twice.

    Returns:
        A dict mapping each distinct keyword to the total number of
        non-overlapping occurrences (as reported by ``str.count``) summed over
        every string in ``danmu_data``. Keywords that never occur map to 0.
    """
    keyword_counts = {keyword: 0 for keyword in keywords}
    for danmu in danmu_data:
        # Iterate the dict's keys rather than the raw ``keywords`` argument so
        # that duplicate keywords do not inflate the totals.
        for keyword in keyword_counts:
            keyword_counts[keyword] += danmu.count(keyword)
    return keyword_counts
# Data visualization
def visualize_data(keyword_counts):
    """Display a bar chart of keyword counts.

    Args:
        keyword_counts: Mapping of keyword -> count. Keys become the bar
            labels on the x axis and values the bar heights.

    The call to ``plt.show()`` opens the figure with whatever Matplotlib
    backend is active.
    """
    # Materialize the dict views as lists; plain sequences are accepted by
    # ``plt.bar`` regardless of how a given Matplotlib version treats views.
    labels = list(keyword_counts.keys())
    counts = list(keyword_counts.values())
    plt.bar(labels, counts)
    plt.xlabel('Keywords')
    plt.ylabel('Counts')
    plt.title('Keyword Counts in Danmu Data')
    plt.show()
# Search-results page for the query "2024巴黎奥运会" (percent-encoded in the URL).
SEARCH_API_URL = 'https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.788&search_source=2'

# NOTE(review): SECURITY - the fallback below is a session cookie committed to
# source control. It should be rotated and removed; set the BILIBILI_COOKIE
# environment variable to supply a cookie without editing this file.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.97 Safari/537.36 Core/1.116.438.400 QQBrowser/13.0.6070.400',
    'Cookie': os.environ.get(
        'BILIBILI_COOKIE',
        'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bp_t_offset_512104208=975890211160457216; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYzMjExMjksImlhdCI6MTcyNjA2MTg2OSwicGx0IjotMX0.wd7VeJfGfUQR0Ofh0z5fnmQ1CpNKEjECYT5nAJWTgEQ; bili_ticket_expires=1726321069; bp_t_offset_3546760311998987=976235500761251840; b_lsid=81077410AF_191E6741DE6; bsource=search_baidu; home_feed_column=5; browser_resolution=2100-1095; SESSDATA=7e54226a%2C1741705429%2C85264%2A92CjC-dRoO18jeyJJLwwGvwJs00QhNm1jjKoflnsViX87MSQJOgo3T2MRFIWOTYe5GVuASVnd4YVRLa0NKN2NDWTdBVFZWb2hYR0M2RXBoTnRyeTliNnptVlFJWWhYWnZtam9pNmpNcm54Q00ybzNUenVxSng5UC1hY2VyYUQ5Y0U3czQ3NHRGOEFBIIEC; bili_jct=87feb43d0b6b672ca3d06eca43c442b1; DedeUserID=512104208; DedeUserID__ckMd5=d02f77dab719b614; sid=q6cirm22',
    ),
    # The original value was 'https: // search.bilibili.com' (stray spaces).
    'Origin': 'https://search.bilibili.com',
    'Referer': 'https://www.bilibili.com/video',
}

# Regular expression that captures the BV id out of every "video/BV..." link.
bv_pattern = re.compile(r'video/(BV\w+)')

# Collect the BV ids from result pages 1 through 11; a set removes duplicates.
all_bv_numbers = set()
for page in range(1, 12):
    if page == 1:
        page_url = SEARCH_API_URL
    else:
        # NOTE(review): the original appended the page number directly after
        # "search_source=2" (producing e.g. "search_source=22"). "&page=" is
        # assumed to be the intended parameter - confirm against the site.
        # The "o" value keeps the original formula 30 * page - 30.
        page_url = SEARCH_API_URL + '&page=' + str(page) + '&o=' + str(30 * page - 30)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=page_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the failed page and carry on with the next one.
        print(f"请求错误: {e}")
        continue
    # Extract every match on this page.
    all_bv_numbers.update(bv_pattern.findall(response.text))
print("所有匹配的BV号:", all_bv_numbers)
# Build one "pagelist" API URL per collected BV id. The responses are used
# further down to look up the ids needed for requesting the danmaku.
# sorted() gives a reproducible request order; iterating the set directly
# yields an arbitrary order that can differ between runs.
url_bag = [
    f'https://api.bilibili.com/x/player/pagelist?bvid={bvv}&jsonp=jsonp'
    for bvv in sorted(all_bv_numbers)
]
# Strip non-ASCII characters from a request-header value
def clean_header(header):
    """Return ``header`` with every non-ASCII character removed.

    Args:
        header: The header text to clean.

    Returns:
        A new string containing only the characters of ``header`` whose code
        point is below 128, in their original order.
    """
    return ''.join(c for c in header if ord(c) < 128)
# Look up the cid of the first entry in each video's page list.
cidd = []
print('working.....')

# The headers are identical for every request, so build them once.
# NOTE(review): SECURITY - the fallback below is a session cookie committed to
# source control. It should be rotated and removed; set the BILIBILI_COOKIE
# environment variable to supply a cookie without editing this file.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Referer': 'https://www.bilibili.com/video',
    'Cookie': os.environ.get(
        'BILIBILI_COOKIE',
        'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bp_t_offset_512104208=976311809445199872; home_feed_column=4; bp_t_offset_3546760311998987=977044801448837120; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NTkwNTIsImlhdCI6MTcyNjQ5OTc5MiwicGx0IjotMX0.nxZ5VaGrzO5b6-53keyv-cFkgHJHJc3AlsstvpFLRqU; bili_ticket_expires=1726758992; b_lsid=FEC2F3BD_19204AAF252; bsource=search_baidu; browser_resolution=1396-753; SESSDATA=248d492e%2C1742208568%2C3b9b3%2A92CjAlo1XIj642R8msLp1dRmDFhrz5qww2VpfuVKpIfW1YPu6THL88hGTut7JX0fhZ3ScSVmFHcktRWnprQnVVWmVKOU13UXU3QTd3ZVV5SDJjQUh1VnQ5dEREZlhOanhQZHhhendBWF9oVlo3WVBkamFCemNvajVOLWpORGlOdlNBT0hhejlzUjd3IIEC; bili_jct=b7b93ee0674805b95693a57e794dec38; DedeUserID=16908229; DedeUserID__ckMd5=a77e39913b0a23e9; sid=7iszo700',
    ),
}

for url in url_bag:
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()  # check that the request succeeded
        json_dict = json.loads(response.text)
        cid = json_dict["data"][0]["cid"]
        cidd.append(cid)
    except requests.RequestException as e:
        print(f"请求错误: {e}")
    except json.JSONDecodeError as e:
        print(f"JSON解码错误: {e}")
    except (KeyError, IndexError, TypeError) as e:
        # IndexError: "data" is an empty list; TypeError: "data" is not
        # subscriptable (e.g. None). Both mean the payload has an unexpected
        # shape, the same situation the KeyError message already describes.
        print(f"键错误: {e}, 可能是数据结构不符合预期")
    except UnicodeEncodeError as e:
        print(f"编码错误: {e}")
    # NOTE(review): the original comment here read "increase the request
    # interval", but no delay is applied between these requests.
print("cid 获取好了!")
print("接下来是一段较为长时间的数据解析过程,请稍作等待...")
# Counter that nothing in the visible code reads or updates afterwards.
cnt=0
# NOTE(review): nothing in the visible code writes to this text-mode handle.
# Opening it in 'w' mode truncates total_data.xls, which is the same path the
# workbook is saved to later; the handle is only closed at the end of the script.
result = open('total_data.xls', 'w', encoding='utf-8')
# Plain-text file that receives each cleaned danmaku string in the fetch loop.
ciyun_file=open('danmu.txt','w',encoding='utf-8')
workbook = openpyxl.Workbook()
# Get the default worksheet
sheet = workbook.active
# Write data: the first cell holds the column header
sheet['A1'] = 'data'
# Accumulates the raw regex matches from every danmaku response.
key_dataa=[]
# Fetch the danmaku of every collected cid, store each cleaned string as a
# worksheet row and in danmu.txt, and keep the raw matches in key_dataa.

# NOTE(review): SECURITY - the fallback below is a session cookie committed to
# source control. It should be rotated and removed; set the BILIBILI_COOKIE
# environment variable to supply a cookie without editing this file.
headers = {
    # The original value started with a stray '"' character.
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Referer': 'https://www.bilibili.com/video',
    'Cookie': os.environ.get(
        'BILIBILI_COOKIE',
        'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bp_t_offset_512104208=976311809445199872; home_feed_column=4; bp_t_offset_3546760311998987=977044801448837120; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NTkwNTIsImlhdCI6MTcyNjQ5OTc5MiwicGx0IjotMX0.nxZ5VaGrzO5b6-53keyv-cFkgHJHJc3AlsstvpFLRqU; bili_ticket_expires=1726758992; b_lsid=FEC2F3BD_19204AAF252; bsource=search_baidu; browser_resolution=1396-753; SESSDATA=248d492e%2C1742208568%2C3b9b3%2A92CjAlo1XIj642R8msLp1dRmDFhrz5qww2VpfuVKpIfW1YPu6THL88hGTut7JX0fhZ3ScSVmFHcktRWnprQnVVWmVKOU13UXU3QTd3ZVV5SDJjQUh1VnQ5dEREZlhOanhQZHhhendBWF9oVlo3WVBkamFCemNvajVOLWpORGlOdlNBT0hhejlzUjd3IIEC; bili_jct=b7b93ee0674805b95693a57e794dec38; DedeUserID=16908229; DedeUserID__ckMd5=a77e39913b0a23e9; sid=7iszo700',
    ),
}

# Captures the text between a ':' and the next '@' in the response body.
# NOTE(review): the ".so" endpoint presumably returns a binary payload, so
# this regex is a heuristic scrape of its decoded text - confirm it extracts
# what is intended.
pattern1 = re.compile(':([^@]*)@')

# "Illegal" control characters, each replaced by '.' before a string is stored
# (presumably because the spreadsheet writer rejects them - TODO confirm).
# Tab, line feed and carriage return are deliberately not in the list.
illegal_chars = ['\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
                 '\x08', '\x0b', '\x0c', '\x0e', '\x0f', '\x10', '\x11', '\x12',
                 '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a',
                 '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', '\x7f']
# One translation table replaces the whole per-character .replace() chain.
illegal_char_table = str.maketrans({char: '.' for char in illegal_chars})

for data in cidd:
    url = 'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid='+str(data)+'&date=2024-09-06'
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Skip this cid rather than abort and lose everything gathered so far.
        print(f"请求错误: {e}")
    else:
        dataa = pattern1.findall(response.text)
        # extend() appends in place instead of copying the list every pass.
        key_dataa.extend(dataa)
        for raw_text in dataa:
            cleaned = raw_text.translate(illegal_char_table)
            sheet.append([cleaned])
            # Strings are written back-to-back with no separator, as before.
            ciyun_file.write(cleaned)
    # Pause between requests.
    time.sleep(1)

# All danmaku text has been written; close the file so it is flushed to disk.
ciyun_file.close()
# Save the collected rows first, so that a failure while plotting cannot lose
# the scraped data.
# NOTE(review): openpyxl presumably writes the .xlsx format regardless of the
# ".xls" file name - confirm, and consider renaming the file everywhere.
workbook.save('total_data.xls')
# Build the chart: count the two athlete names across all raw danmaku strings
# and show the totals as a bar chart.
keywords = ['全红婵', '潘展乐']
keyword_counts = count_keywords(key_dataa, keywords)
visualize_data(keyword_counts)
# Keep only the rows that mention one of these AI-related keywords and write
# them to keyword.xlsx.
keywords = ['ai','智能','AI','人工']
# Read the raw sheet back. The file is written by openpyxl, so the engine is
# named explicitly instead of being inferred from the ".xls" extension.
df = pd.read_excel('total_data.xls', engine='openpyxl')
# A keyword only counts when it is not directly preceded or followed by an
# ASCII letter. re.escape keeps any regex metacharacter in a keyword literal.
pattern = r'(?<![a-zA-Z])(' + '|'.join(re.escape(keyword) for keyword in keywords) + r')(?![a-zA-Z])'
keyword_re = re.compile(pattern)
# Test every row of the first column; astype(bool) keeps the result a boolean
# mask even when the sheet has no data rows.
contains_keywords = df.iloc[:, 0].apply(
    lambda x: bool(keyword_re.search(str(x)))
).astype(bool)
# Select the rows that contain a keyword
filtered_df = df[contains_keywords]
# Create a new Excel workbook
wb = Workbook()
ws = wb.active
# Write the matching rows into the new sheet (no header row is written)
for row in filtered_df.itertuples(index=False):
    ws.append(row)
# Save the new workbook
wb.save('keyword.xlsx')
# Close the workbook
wb.close()
result.close()