ADD file via upload

main
pfc8hp2r6 11 months ago
parent d7b68aae73
commit f0b8ae150e

@ -0,0 +1,222 @@
import json
import os
import re
import time

import matplotlib.pyplot as plt
import openpyxl
import pandas as pd
import requests
from openpyxl import Workbook
# Configure the Matplotlib font settings used by the chart below.
plt.rcParams['font.sans-serif'] = ['SimHei'] # use the SimHei typeface (presumably so Chinese labels render)
plt.rcParams['axes.unicode_minus'] = False # display the minus sign correctly
def count_keywords(danmu_data, keywords):
    """Count how often each keyword occurs across all danmu strings.

    Args:
        danmu_data: Iterable of danmu (comment) strings.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A dict mapping every distinct keyword to the total number of
        non-overlapping occurrences (``str.count`` semantics) summed over
        all entries of ``danmu_data``. Keywords that never occur map to 0.
    """
    # Seed every keyword with zero so absent keywords still appear in the result.
    keyword_counts = {keyword: 0 for keyword in keywords}
    for danmu in danmu_data:
        # Iterate over the dict rather than ``keywords``: a keyword listed
        # twice is then counted once, and a one-shot iterable (already
        # consumed by the comprehension above) still works.
        for keyword in keyword_counts:
            keyword_counts[keyword] += danmu.count(keyword)
    return keyword_counts
# Data visualisation
def visualize_data(keyword_counts):
    """Display a bar chart with one bar per keyword.

    Args:
        keyword_counts: Mapping of keyword -> count. Keys become the bar
            labels and values the bar heights.
    """
    labels = list(keyword_counts.keys())
    heights = list(keyword_counts.values())
    plt.bar(labels, heights)
    plt.title('Keyword Counts in Danmu Data')
    plt.xlabel('Keywords')
    plt.ylabel('Counts')
    plt.show()
# --- Step 1: collect BV ids from the Bilibili search result pages ----------
# URL of the first result page (the keyword is the URL-encoded query for the
# 2024 Paris Olympics). Later pages are derived from it below.
SEARCH_API_URL = 'https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.788&search_source=2'

# One header set for every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.97 Safari/537.36 Core/1.116.438.400 QQBrowser/13.0.6070.400',
    'Origin': 'https://search.bilibili.com',
    'Referer': 'https://www.bilibili.com/video',
}
# The session cookie is a credential, so it is read from the BILIBILI_COOKIE
# environment variable instead of being stored in the source file.
cookie = os.environ.get('BILIBILI_COOKIE', '')
if cookie:
    headers['Cookie'] = cookie
else:
    print('未设置环境变量 BILIBILI_COOKIE, 将以未登录状态发起请求')

# Regular expression that captures the BV id out of "video/BV..." links.
pattern = r'video/(BV\w+)'

# A set removes the duplicates that appear when a video is linked several
# times on a page or shows up on more than one page.
all_bv_numbers = set()
for a in range(1, 12):
    # Offset of the first result on page ``a``; assumes 30 results per page.
    b = 30 * a - 30
    if a == 1:
        page_url = SEARCH_API_URL
    else:
        # Page number and offset are passed as separate query parameters;
        # appending the bare number would corrupt ``search_source``.
        page_url = SEARCH_API_URL + '&page=' + str(a) + '&o=' + str(b)
    try:
        # The timeout keeps a stalled connection from hanging the script.
        response = requests.get(url=page_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # A single failed page should not discard the pages already fetched.
        print(f"请求错误: {e}")
        continue
    all_bv_numbers.update(re.findall(pattern, response.text))
print("所有匹配的BV号:", all_bv_numbers)
# Build one page-list API URL per collected BV id. The responses are used
# later to request and parse the danmu that make up the final data set.
url_bag = [
    'https://api.bilibili.com/x/player/pagelist?bvid=' + str(bv_id) + '&jsonp=jsonp'
    for bv_id in all_bv_numbers
]
# Strip non-ASCII characters from a request header value.
def clean_header(header):
    """Return ``header`` with every non-ASCII character removed.

    Args:
        header: The header string to clean.

    Returns:
        A new string containing only the characters of ``header`` whose
        code point is below 128, in their original order.
    """
    ascii_only = [ch for ch in header if ord(ch) < 128]
    return ''.join(ascii_only)
# --- Step 2: resolve the cid of every video through the page-list API ------
cidd = []
print('working.....')

# The headers are the same for every request, so they are built once.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Referer': 'https://www.bilibili.com/video',
}
# The session cookie is a credential, so it is read from the BILIBILI_COOKIE
# environment variable instead of being stored in the source file.
cookie = os.environ.get('BILIBILI_COOKIE', '')
if cookie:
    headers['Cookie'] = cookie

for url in url_bag:
    try:
        # The timeout keeps a stalled connection from hanging the script.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()  # treat HTTP error statuses as failures
        json_dict = json.loads(response.text)
        # Only the first entry of "data" is used for each video.
        cid = json_dict["data"][0]["cid"]
    except requests.RequestException as e:
        print(f"请求错误: {e}")
    except json.JSONDecodeError as e:
        print(f"JSON解码错误: {e}")
    except (KeyError, IndexError, TypeError) as e:
        # IndexError / TypeError cover an empty or null "data" field, which
        # would otherwise abort the whole run.
        print(f"键错误: {e}, 可能是数据结构不符合预期")
    except UnicodeEncodeError as e:
        print(f"编码错误: {e}")
    else:
        cidd.append(cid)
    # NOTE(review): the original comment here said to add a delay between
    # requests, but no delay was ever implemented; add a time.sleep(...) if
    # the API starts throttling.
print("cid 获取好了!")
# --- Step 3: download the danmu of every cid, store them, chart keywords ---
print("接下来是一段较为长时间的数据解析过程,请稍作等待...")

# ``result`` is never written to. It is still created (and closed at once)
# because the end of the script calls ``result.close()``; closing it here
# means no handle is open on total_data.xls while the workbook is saved.
result = open('total_data.xls', 'w', encoding='utf-8')
result.close()

workbook = openpyxl.Workbook()
# Use the default worksheet and give the single column a header.
sheet = workbook.active
sheet['A1'] = 'data'

# Date sent in the ``date`` query parameter of every history request.
danmu_date = '2024-09-06'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Referer': 'https://www.bilibili.com/video',
}
# The session cookie is a credential, so it is read from the BILIBILI_COOKIE
# environment variable instead of being stored in the source file.
cookie = os.environ.get('BILIBILI_COOKIE', '')
if cookie:
    headers['Cookie'] = cookie
else:
    print('未设置环境变量 BILIBILI_COOKIE, 将以未登录状态发起请求')

# Control characters that are replaced by '.' before a danmu is written to
# the worksheet and the text file.
illegal_chars = ['\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
                 '\x08', '\x0b', '\x0c', '\x0e', '\x0f', '\x10', '\x11', '\x12',
                 '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a',
                 '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', '\x7f']
# One translation table replaces all of them in a single pass per danmu.
sanitize_table = str.maketrans({char: '.' for char in illegal_chars})

# NOTE(review): the response is scanned with a regex instead of being parsed;
# this assumes each danmu text sits between ':' and '@' in the decoded
# payload - TODO confirm against the real response format.
pattern1 = ':([^@]*)@'

key_dataa = []  # raw (unsanitised) danmu of all videos, used for the chart
with open('danmu.txt', 'w', encoding='utf-8') as ciyun_file:
    for data in cidd:
        url = 'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid=' + str(data) + '&date=' + danmu_date
        try:
            # The timeout keeps a stalled connection from hanging the script.
            response = requests.get(url=url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # Skip this video but keep everything collected so far.
            print(f"请求错误: {e}")
            time.sleep(1)
            continue
        dataa = re.findall(pattern1, response.text)
        key_dataa.extend(dataa)
        for a in dataa:
            a = a.translate(sanitize_table)
            sheet.append([a])
            # One danmu per line so individual entries stay distinguishable.
            ciyun_file.write(a + '\n')
        # Pause between requests.
        time.sleep(1)

# Save before charting: plt.show() blocks until the window is closed, and
# the collected data should already be on disk by then. The workbook is
# written in openpyxl's format even though the name ends in .xls; the name
# is kept because the following step reads this exact path.
workbook.save('total_data.xls')

# Build the chart.
keywords = ['全红婵', '潘展乐']
keyword_counts = count_keywords(key_dataa, keywords)
visualize_data(keyword_counts)
# --- Step 4: copy the danmu that mention AI keywords to keyword.xlsx -------
keywords = ['ai', '智能', 'AI', '人工']

# Read the sheet written earlier. The engine is given explicitly because the
# file name ends in .xls while the script saves it with openpyxl, so relying
# on automatic engine selection is fragile.
df = pd.read_excel('total_data.xls', engine='openpyxl')

# A keyword only matches when it is not directly preceded or followed by an
# ASCII letter. Keywords are escaped so that any regex metacharacter added
# to the list later is matched literally.
pattern = r'(?<![a-zA-Z])(' + '|'.join(re.escape(k) for k in keywords) + r')(?![a-zA-Z])'
keyword_re = re.compile(pattern)

# Boolean mask over the first column; astype(bool) keeps the mask boolean
# even when the sheet contains no data rows.
contains_keywords = df.iloc[:, 0].apply(lambda x: bool(keyword_re.search(str(x)))).astype(bool)

# Keep only the rows that contain a keyword.
filtered_df = df[contains_keywords]

# Write the matching rows (without a header row) to a new workbook.
wb = Workbook()
ws = wb.active
for row in filtered_df.itertuples(index=False):
    ws.append(list(row))

# Save and close the new workbook.
wb.save('keyword.xlsx')
wb.close()

# Close the handle opened earlier in the script for total_data.xls.
result.close()
Loading…
Cancel
Save