|
|
|
@ -1,219 +0,0 @@
|
|
|
|
|
import requests
|
|
|
|
|
import re
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import json
|
|
|
|
|
import time
|
|
|
|
|
import openpyxl
|
|
|
|
|
from openpyxl import Workbook
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
|
|
# Set the font Matplotlib uses for sans-serif text.
plt.rcParams['font.sans-serif'] = ['SimHei'] # Use the SimHei typeface; presumably so Chinese text in charts renders correctly - confirm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def count_keywords(danmu_data, keywords):
    """Tally keyword occurrences across a collection of danmaku strings.

    Args:
        danmu_data: Iterable of strings to scan.
        keywords: Sequence of keyword strings to look for.

    Returns:
        A dict mapping each keyword to the total number of occurrences
        (as counted by ``str.count``, i.e. non-overlapping) summed over
        every string in ``danmu_data``.
    """
    totals = dict.fromkeys(keywords, 0)
    for text in danmu_data:
        for word in keywords:
            totals[word] = totals[word] + text.count(word)
    return totals
|
|
|
|
|
|
|
|
|
|
# Data visualization
def visualize_data(keyword_counts):
    """Draw a bar chart of keyword frequencies and display it.

    Args:
        keyword_counts: Mapping of keyword -> count. The keys are used as
            the bar positions/labels and the values as the bar heights.

    The chart is displayed by calling ``plt.show()``; nothing is returned.
    """
    bar_labels = keyword_counts.keys()
    bar_heights = keyword_counts.values()
    plt.bar(bar_labels, bar_heights)

    # Title and axis captions.
    plt.title('Keyword Counts in Danmu Data')
    plt.xlabel('Keywords')
    plt.ylabel('Counts')

    plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Step 1: collect the BV ids that appear on the Bilibili search result pages.
# ---------------------------------------------------------------------------

# Base search URL (first result page). The keyword is URL-encoded in the query.
SEARCH_API_URL = 'https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.788&search_source=2'

# Request headers shared by every search-page request.
# SECURITY NOTE(review): the Cookie below embeds account session tokens
# (SESSDATA / bili_jct). It is kept verbatim so the script behaves as before,
# but it should be loaded from the environment or an untracked config file.
headers = {
    # The original value started with a stray '"' character; removed.
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Cookie': 'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bp_t_offset_512104208=975890211160457216; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYzMjExMjksImlhdCI6MTcyNjA2MTg2OSwicGx0IjotMX0.wd7VeJfGfUQR0Ofh0z5fnmQ1CpNKEjECYT5nAJWTgEQ; bili_ticket_expires=1726321069; bp_t_offset_3546760311998987=976235500761251840; b_lsid=81077410AF_191E6741DE6; bsource=search_baidu; home_feed_column=5; browser_resolution=2100-1095; SESSDATA=7e54226a%2C1741705429%2C85264%2A92CjC-dRoO18jeyJJLwwGvwJs00QhNm1jjKoflnsViX87MSQJOgo3T2MRFIWOTYe5GVuASVnd4YVRLa0NKN2NDWTdBVFZWb2hYR0M2RXBoTnRyeTliNnptVlFJWWhYWnZtam9pNmpNcm54Q00ybzNUenVxSng5UC1hY2VyYUQ5Y0U3czQ3NHRGOEFBIIEC; bili_jct=87feb43d0b6b672ca3d06eca43c442b1; DedeUserID=512104208; DedeUserID__ckMd5=d02f77dab719b614; sid=q6cirm22',
    # The original value contained spaces ('https: // ...'), which is not a
    # valid origin.
    'Origin': 'https://search.bilibili.com',
    'Referer': 'https://www.bilibili.com/video',
}

# Regular expression that captures the BV id out of a "video/BVxxxx" link.
pattern = r'video/(BV\w+)'

# Pages 1..11 are scanned (the original fetched page 1, then looped 2..11).
LAST_SEARCH_PAGE = 11
# Offset step used for the "o" query parameter (30 * (page - 1)).
RESULTS_PER_PAGE = 30

# Extract all matches from every page.
all_bv_numbers = []
for page in range(1, LAST_SEARCH_PAGE + 1):
    if page == 1:
        page_url = SEARCH_API_URL
    else:
        # The original concatenated the page number directly onto
        # "search_source=2" (yielding search_source=22, 23, ...) and never
        # sent a page parameter; pass page and offset explicitly instead.
        page_url = f'{SEARCH_API_URL}&page={page}&o={RESULTS_PER_PAGE * (page - 1)}'

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=page_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: skip a page that fails and keep scanning the rest.
        print(f"请求错误: {e}")
        continue

    all_bv_numbers.extend(re.findall(pattern, response.text))

# De-duplicate: the same video can be linked several times / on several pages.
all_bv_numbers = set(all_bv_numbers)
print("所有匹配的BV号:", all_bv_numbers)
|
|
|
|
|
|
|
|
|
|
# Build one player "pagelist" API URL per collected BV id.
url_bag = [
    f'https://api.bilibili.com/x/player/pagelist?bvid={bvv}&jsonp=jsonp'
    for bvv in all_bv_numbers
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Strip non-ASCII characters from a request-header value.
def clean_header(header):
    """Return ``header`` with every character of code point >= 128 removed.

    Args:
        header: String (or any iterable of single characters) to filter.

    Returns:
        A new string containing only the characters whose ``ord`` is
        below 128, in their original order.
    """
    ascii_only = (ch for ch in header if ord(ch) <= 127)
    return ''.join(ascii_only)
|
|
|
|
|
# ---------------------------------------------------------------------------
# Step 2: resolve a cid for every pagelist URL in url_bag.
# ---------------------------------------------------------------------------
cidd = []
print('working.....')

# The headers are the same for every pagelist request, so build them once
# instead of on each loop iteration.
# SECURITY NOTE(review): the Cookie embeds account session tokens
# (SESSDATA / bili_jct); kept verbatim to preserve behavior, but it should be
# moved out of source control.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Referer': 'https://www.bilibili.com/video',
    'Cookie': 'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYzMjExMjksImlhdCI6MTcyNjA2MTg2OSwicGx0IjotMX0.wd7VeJfGfUQR0Ofh0z5fnmQ1CpNKEjECYT5nAJWTgEQ; bili_ticket_expires=1726321069; bsource=search_baidu; home_feed_column=5; browser_resolution=2100-1095; b_lsid=2C2F5DE8_191E6C6B17F; bp_t_offset_512104208=976311809445199872; SESSDATA=64f9af2c%2C1741705746%2C252bc%2A92CjBco6PCEf5jHGOtLwBCnpRRhnXcR0SL850C4F5X8GK2eVczaFKXrWQv4b7zWBkS77cSVmtEdktONDd2RUx0NTQybzAwZTdlOU1jNGlvbnZuN0YzU3VOZFBPZVBWMEVqWFN5a3hBdlVYTWkwaXZQTndCR3FQUkVIT1N1elpCNUI5UFZPc2JTYzNnIIEC; bili_jct=0dbb281cc47245a2970662a9f04112ea; DedeUserID=3546760311998987; DedeUserID__ckMd5=945a443ae2d0a983; sid=8uthyfdc; bp_t_offset_3546760311998987=976312754338004992',
}

for url in url_bag:
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()  # fail on non-2xx responses

        json_dict = json.loads(response.text)
        # Only the first entry of "data" is used.
        cid = json_dict["data"][0]["cid"]
        cidd.append(cid)
    except requests.RequestException as e:
        print(f"请求错误: {e}")
    except json.JSONDecodeError as e:
        print(f"JSON解码错误: {e}")
    except (KeyError, IndexError, TypeError) as e:
        # IndexError / TypeError cover an empty or null "data" field, which
        # previously escaped the handlers and aborted the whole loop.
        print(f"键错误: {e}, 可能是数据结构不符合预期")
    except UnicodeEncodeError as e:
        print(f"编码错误: {e}")

    # Pause between requests to lengthen the request interval.
    time.sleep(2)

print("cid 获取好了!")
print("接下来是一段较为长时间的数据解析过程,请稍作等待...")
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Step 3: download danmaku for every cid, export them, chart keyword counts,
# and write the rows that mention AI-related keywords to a second workbook.
# ---------------------------------------------------------------------------

# Date passed to the danmaku history endpoint.
DANMU_DATE = '2024-09-06'

# Control characters replaced by '.' before a string is written to the sheet
# (openpyxl raises IllegalCharacterError for most of these).
illegal_chars = ['\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
                 '\x08', '\x0b', '\x0c', '\x0e', '\x0f', '\x10', '\x11', '\x12',
                 '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a',
                 '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', '\x7f']
# One translation table replaces ~30 chained str.replace calls per danmaku.
illegal_char_table = str.maketrans(dict.fromkeys(illegal_chars, '.'))

# Danmaku text is taken as whatever sits between a ':' and the next '@' in
# the response body.
pattern1 = ':([^@]*)@'

# Headers are identical for every danmaku request, so build them once.
# SECURITY NOTE(review): the Cookie embeds account session tokens
# (SESSDATA / bili_jct); kept verbatim to preserve behavior, but it should be
# moved out of source control.
headers = {
    # The original value started with a stray '"' character; removed.
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Referer': 'https://www.bilibili.com/video',
    'Cookie': 'buvid3=37CAA054-A8A1-787E-CD23-05D8BC9EEB4780230infoc; b_nut=1725113280; _uuid=1C1E246E-F3B6-D13B-106CD-A109D35CB4D9B67205infoc; enable_web_push=DISABLE; buvid4=C1C36600-6C10-0893-8E67-0999E6D18DC620055-024070815-r8KbDAvfSCu8tCCZ7llkuA%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u))kJumk~~0J\'u~kluuJk)l; header_theme_version=CLOSE; fingerprint=13c5a0300ec32b6c6d2a5a81a8682b18; buvid_fp_plain=undefined; buvid_fp=13c5a0300ec32b6c6d2a5a81a8682b18; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYzMjExMjksImlhdCI6MTcyNjA2MTg2OSwicGx0IjotMX0.wd7VeJfGfUQR0Ofh0z5fnmQ1CpNKEjECYT5nAJWTgEQ; bili_ticket_expires=1726321069; bp_t_offset_3546760311998987=976235500761251840; bsource=search_baidu; home_feed_column=5; browser_resolution=2100-1095; b_lsid=2C2F5DE8_191E6C6B17F; bp_t_offset_512104208=976311809445199872; SESSDATA=64f9af2c%2C1741705746%2C252bc%2A92CjBco6PCEf5jHGOtLwBCnpRRhnXcR0SL850C4F5X8GK2eVczaFKXrWQv4b7zWBkS77cSVmtEdktONDd2RUx0NTQybzAwZTdlOU1jNGlvbnZuN0YzU3VOZFBPZVBWMEVqWFN5a3hBdlVYTWkwaXZQTndCR3FQUkVIT1N1elpCNUI5UFZPc2JTYzNnIIEC; bili_jct=0dbb281cc47245a2970662a9f04112ea; DedeUserID=3546760311998987; DedeUserID__ckMd5=945a443ae2d0a983; sid=8uthyfdc',
}

# Workbook that receives one danmaku per row under a "data" header cell.
workbook = openpyxl.Workbook()
sheet = workbook.active  # default worksheet
sheet['A1'] = 'data'

# Raw (unsanitized) danmaku from every video, used for the keyword chart.
key_dataa = []

# The original also opened 'total_data.xls' as a plain text handle ("result")
# that was never written to and shared its path with workbook.save(); that
# handle is dropped. 'danmu.txt' is now closed deterministically via "with".
with open('danmu.txt', 'w', encoding='utf-8') as ciyun_file:
    for data in cidd:
        url = f'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid={data}&date={DANMU_DATE}'
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url=url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # Best effort: skip this video instead of losing everything
            # collected so far.
            print(f"请求错误: {e}")
            continue

        dataa = re.findall(pattern1, response.text)
        key_dataa.extend(dataa)

        for a in dataa:
            # Replace illegal characters before writing the cell.
            a = a.translate(illegal_char_table)
            sheet.append([a])
            # Danmaku are written back-to-back with no separator, exactly as
            # the original did.
            ciyun_file.write(a)

# Persist the scraped data before opening the (blocking) chart window, so a
# plotting failure cannot discard it.
# NOTE: openpyxl writes xlsx-format content even though the name ends in .xls.
workbook.save('total_data.xls')

# Build the chart.
keywords = ['全红婵', '潘展乐']
keyword_counts = count_keywords(key_dataa, keywords)
visualize_data(keyword_counts)

keywords = ['ai', '智能', 'AI', '人工']

# Read the exported sheet back. The engine is pinned to openpyxl because the
# file holds xlsx-format data despite its .xls extension.
df = pd.read_excel('total_data.xls', engine='openpyxl')

# A row matches when it contains a keyword that is neither preceded nor
# followed by an ASCII letter (so "ai" inside "said" does not count).
pattern = r'(?<![a-zA-Z])(' + '|'.join(re.escape(k) for k in keywords) + r')(?![a-zA-Z])'
keyword_re = re.compile(pattern)
contains_keywords = (
    df.iloc[:, 0]
    .apply(lambda x: bool(keyword_re.search(str(x))))
    .astype(bool)  # keeps the mask boolean even when the sheet has no rows
)

# Keep only the rows that contain a keyword.
filtered_df = df[contains_keywords]

# Write the matching rows to a new workbook.
wb = Workbook()
ws = wb.active
for row in filtered_df.itertuples(index=False):
    ws.append(row)

# Save and close the new workbook.
wb.save('keyword.xlsx')
wb.close()
|