ADD file via upload

main
pfc8hp2r6 11 months ago
parent e00154dbc3
commit b3e194e5e2

@ -0,0 +1,201 @@
import json
import os
import random
import re
import time

import chardet
import openpyxl
import pandas as pd
import requests
import xlrd
import xlwings as xw
import xlwt
from openpyxl import Workbook
from openpyxl.reader.excel import load_workbook
# Fetch the first page of search results and collect every BV id linked from it.
# The keyword is percent-encoded inside the URL.
SEARCH_API_URL ='https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.788&search_source=2'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Origin': 'https://search.bilibili.com',
    'Referer': 'https://www.bilibili.com/video',
}
# The session cookie is taken from the BILIBILI_COOKIE environment variable so
# that account credentials never live in the source file. When the variable is
# unset the request is sent without a Cookie header.
bilibili_cookie = os.environ.get('BILIBILI_COOKIE')
if bilibili_cookie:
    headers['Cookie'] = bilibili_cookie

# A BV id is whatever word characters follow "video/" in the page markup.
pattern = r'video/(BV\w+)'
# Kept as a list here; duplicates are removed once all pages have been read.
all_bv_numbers = []
try:
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url=SEARCH_API_URL, headers=headers, timeout=10)
    response.raise_for_status()
except requests.RequestException as e:
    # Best effort: a failed first page must not stop the remaining pages.
    print(f"请求错误: {e}")
else:
    all_bv_numbers.extend(re.findall(pattern, response.text))
# Walk result pages 2-11, add the BV ids found on each page to
# all_bv_numbers, then drop duplicates.
search_base_url = 'https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.788&search_source=2'
# The headers do not change between pages, so they are built once.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.97 Safari/537.36 Core/1.116.438.400 QQBrowser/13.0.6070.400',
}
# The session cookie comes from the environment instead of being hard-coded.
bilibili_cookie = os.environ.get('BILIBILI_COOKIE')
if bilibili_cookie:
    headers['Cookie'] = bilibili_cookie
# A BV id is whatever word characters follow "video/" in the page markup.
pattern = r'video/(BV\w+)'
for page in range(2, 12):
    # Offset of the first result on this page (30 results per page).
    offset = 30 * (page - 1)
    # The page number needs its own "page" parameter; gluing it straight onto
    # "search_source=2" turned that value into 22, 23, ... instead.
    # NOTE(review): "page" is assumed to be the parameter the site expects — confirm.
    page_url = f'{search_base_url}&page={page}&o={offset}'
    try:
        response = requests.get(url=page_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Skip a page that cannot be fetched rather than aborting the crawl.
        print(f"请求错误: {e}")
        continue
    all_bv_numbers.extend(re.findall(pattern, response.text))
# The same video can show up on several pages; keep each BV id once.
all_bv_numbers = set(all_bv_numbers)
print(len(all_bv_numbers))
print("所有匹配的BV号:", all_bv_numbers)
# One pagelist API URL per collected BV id; these URLs are requested further
# down to look up each video's cid.
url_bag = [
    f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp'
    for bv_id in all_bv_numbers
]
# Strip non-ASCII characters from a request-header value.
def clean_header(header):
    """Return *header* with every character whose code point is >= 128 removed."""
    ascii_only = []
    for ch in header:
        if ord(ch) <= 127:
            ascii_only.append(ch)
    return ''.join(ascii_only)
# Resolve every pagelist URL to the cid of the video's first entry.
cidd = []
# The headers are identical for every request, so they are built once.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Referer': 'https://www.bilibili.com/video',
}
# The session cookie comes from the environment instead of being hard-coded.
bilibili_cookie = os.environ.get('BILIBILI_COOKIE')
if bilibili_cookie:
    headers['Cookie'] = bilibili_cookie
for url in url_bag:
    try:
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()  # turn HTTP error statuses into exceptions
        print("Response Text:", response.text)
        json_dict = json.loads(response.text)
        # Only the first entry of "data" is used.
        cid = json_dict["data"][0]["cid"]
        print(cid)
        cidd.append(cid)
    except requests.RequestException as e:
        print(f"请求错误: {e}")
    except json.JSONDecodeError as e:
        print(f"JSON解码错误: {e}")
    except (KeyError, IndexError, TypeError) as e:
        # "data" may be missing, empty, or null when the API reports an error;
        # all three mean the payload did not have the expected shape.
        print(f"键错误: {e}, 可能是数据结构不符合预期")
    except UnicodeEncodeError as e:
        print(f"编码错误: {e}")
    # Pause between requests to keep the request rate low.
    time.sleep(1)
# Control characters removed from every danmaku before it is stored:
# 0x00-0x08, 0x0b, 0x0c, 0x0e-0x1f and 0x7f. Tab, LF and CR are kept.
ILLEGAL_CHAR_TABLE = str.maketrans(
    '',
    '',
    ''.join(map(chr, [*range(0x00, 0x09), 0x0b, 0x0c, *range(0x0e, 0x20), 0x7f])),
)

keywords = ['ai', '智能']
# A keyword only counts when no ASCII letter touches it on either side, so
# "ai" inside "rain" is ignored. Matching is case-insensitive so that "AI"
# is found as well; keywords are escaped so they are always taken literally.
KEYWORD_PATTERN = re.compile(
    r'(?<![a-zA-Z])(' + '|'.join(re.escape(k) for k in keywords) + r')(?![a-zA-Z])',
    re.IGNORECASE,
)


def strip_illegal_chars(text):
    """Return ``str(text)`` without the characters in ILLEGAL_CHAR_TABLE."""
    return str(text).translate(ILLEGAL_CHAR_TABLE)


def extract_danmaku(payload):
    """Return the cleaned danmaku strings found in *payload*.

    Every run of non-"@" characters between a ":" and the next "@" is taken
    as one danmaku.
    NOTE(review): the response looks like a binary payload decoded as text,
    so this pattern is a heuristic — confirm against the real format.
    """
    return [strip_illegal_chars(raw) for raw in re.findall(':([^@]*)@', payload)]


def contains_keyword(value):
    """Return True when ``str(value)`` contains one of ``keywords``."""
    return bool(KEYWORD_PATTERN.search(str(value)))


def collect_danmaku(cids, date='2024-09-06'):
    """Download the danmaku of every cid, store them, and filter by keyword.

    Writes three files in the working directory:
      * ``danmu.txt``      - all danmaku, written back to back
      * ``total_data.xls`` - one danmaku per row under a "data" header
      * ``keyword.xlsx``   - only the rows that mention a keyword

    Args:
        cids: iterable of cid values to query.
        date: history date sent to the API, formatted YYYY-MM-DD.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Referer': 'https://www.bilibili.com/video',
    }
    # The session cookie comes from the environment instead of being hard-coded.
    # NOTE(review): this endpoint presumably needs a logged-in session — confirm.
    bilibili_cookie = os.environ.get('BILIBILI_COOKIE')
    if bilibili_cookie:
        headers['Cookie'] = bilibili_cookie

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet['A1'] = 'data'

    # The context manager guarantees danmu.txt is flushed and closed.
    with open('danmu.txt', 'w', encoding='utf-8') as ciyun_file:
        for cid in cids:
            url = 'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid=' + str(cid) + '&date=' + date
            try:
                response = requests.get(url=url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                # Skip a cid that cannot be fetched rather than losing the rest.
                print(f"请求错误: {e}")
            else:
                print(response.text)
                for text in extract_danmaku(response.text):
                    sheet.append([text])
                    # NOTE(review): entries are written with no separator — confirm.
                    ciyun_file.write(text)
            # Pause between requests to keep the request rate low.
            time.sleep(1)

    # NOTE(review): openpyxl writes xlsx content even though the name ends in .xls.
    workbook.save('total_data.xls')

    # Re-read the sheet so the "data" header row becomes the column name.
    df = pd.read_excel('total_data.xls')
    # astype(bool) keeps the mask boolean even when the sheet has no data rows.
    mask = df.iloc[:, 0].apply(contains_keyword).astype(bool)
    filtered_df = df[mask]

    wb = Workbook()
    ws = wb.active
    for row in filtered_df.itertuples(index=False):
        ws.append(list(row))
    wb.save('keyword.xlsx')


if __name__ == "__main__":
    collect_danmaku(cidd)
Loading…
Cancel
Save