You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
89 lines
4.8 KiB
89 lines
4.8 KiB
2 months ago
|
import requests
|
||
|
import re
|
||
|
import jieba
|
||
|
import wordcloud
|
||
|
import imageio
|
||
|
from urllib import request #请求模块
|
||
|
from fake_useragent import UserAgent #在线生成User-Agent
|
||
|
import time, re,csv
|
||
|
|
||
|
def get_urls(url):
    """Fetch one page and append the link targets found on it to ``aaa.txt``.

    Every ``<a href="...">`` target in the downloaded HTML is collected,
    de-duplicated, and appended to ``aaa.txt`` one per line.  The first two
    characters and the last character of each target are dropped before
    writing (presumably the leading ``//`` and a trailing ``/`` of
    protocol-relative links -- confirm against real page output).  The number
    of distinct targets is printed.

    Args:
        url: Address of the page to download.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,
        # NOTE(review): this is a hard-coded account session cookie (SESSDATA,
        # bili_jct).  It is a credential committed to source; it should be
        # loaded from the environment / a config file and rotated.
        "cookie": "CURRENT_FNVAL=4048; DedeUserID=599512569; DedeUserID__ckMd5=dc4a13272f0a9ea2; buvid3=D25424AB-ED5E-FB32-6058-CEAFCE9BED0785126infoc; b_nut=1702298485; _uuid=3B16C9CF-A4D3-157C-C788-4414C7D1CB6883241infoc; rpdid=|(u))kkYu|)l0J'u~ukYYJRRl; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; buvid4=BC1C2D23-29FB-83CA-7F80-AFC2DC96707044936-024091707-xyz%2FLhNhkMgQoBuUX4qoqQ%3D%3D; SESSDATA=f7936ef9%2C1742109545%2Cb955c%2A92CjBowCYGlCGs8LxEIVDfq_sHffd2IenL64iEzkCgBPuRNCQHJSPLTi_2TRVl6WouW0QSVmZRNUtHWGhQc04xLWFXQndaNUNEelhjZ0V4TmVsemRDbUUxMXk0OHg3OVY3UnptZ09WckxmQ2VSSVdlMFg5MGR6VU9sa0RZSTZndjNkcVlzM0tmd0FRIIEC; bili_jct=f57986cccdf574fa8be013c21ba69f00; sid=4ovm4bl0; fingerprint=34f624a48e970862c6be836c5ea87b35; buvid_fp_plain=undefined; buvid_fp=34f624a48e970862c6be836c5ea87b35; browser_resolution=1659-941; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4NDc0NDAsImlhdCI6MTcyNjU4ODE4MCwicGx0IjotMX0.h6GQaSN55QhTvi5tIFZ-hhhO7I5X55Pm2HR1R_k1_60; bili_ticket_expires=1726847380; b_lsid=171C6103F_19202722AED; bp_t_offset_599512569=978313367284350976",
    }

    req = request.Request(url=url, headers=headers)  # build the request object
    # The context manager closes the HTTP response even if reading or decoding
    # fails; the timeout keeps a stalled connection from hanging the crawl.
    with request.urlopen(req, timeout=30) as res:
        html = res.read().decode()

    # Capture only the text inside the parentheses (the href value).
    pattern = re.compile('<a href="(.*?)"', re.S)
    # dict.fromkeys de-duplicates while keeping the order links appear on the
    # page, so the output file is reproducible from run to run.
    r_list = list(dict.fromkeys(pattern.findall(html)))

    print(len(r_list))
    with open("aaa.txt", 'a', newline='') as file:
        file.writelines(link[2:-1] + '\n' for link in r_list)
|
||
|
|
||
|
def get_barrage(url):
    """Download the danmaku of one video page and append them to ``ba.txt``.

    The page at ``url`` is fetched and the first ``"cid":<digits>,`` value in
    its text is extracted.  ``https://comment.bilibili.com/<cid>.xml`` is then
    downloaded, and the text of every ``<d ...>...</d>`` element except the
    last three matches is appended to ``ba.txt``, one per line.

    Pages that contain no cid are reported and skipped instead of aborting
    the whole run.

    Args:
        url: Address of the video page.

    Raises:
        requests.RequestException: If either HTTP request fails or times out.
        UnicodeDecodeError: If the danmaku XML is not valid UTF-8.
    """
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,
        # NOTE(review): this is a hard-coded account session cookie (SESSDATA,
        # bili_jct).  It is a credential committed to source; it should be
        # loaded from the environment / a config file and rotated.
        "cookie": "CURRENT_FNVAL=4048; DedeUserID=599512569; DedeUserID__ckMd5=dc4a13272f0a9ea2; buvid3=D25424AB-ED5E-FB32-6058-CEAFCE9BED0785126infoc; b_nut=1702298485; _uuid=3B16C9CF-A4D3-157C-C788-4414C7D1CB6883241infoc; rpdid=|(u))kkYu|)l0J'u~ukYYJRRl; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; buvid4=BC1C2D23-29FB-83CA-7F80-AFC2DC96707044936-024091707-xyz%2FLhNhkMgQoBuUX4qoqQ%3D%3D; SESSDATA=f7936ef9%2C1742109545%2Cb955c%2A92CjBowCYGlCGs8LxEIVDfq_sHffd2IenL64iEzkCgBPuRNCQHJSPLTi_2TRVl6WouW0QSVmZRNUtHWGhQc04xLWFXQndaNUNEelhjZ0V4TmVsemRDbUUxMXk0OHg3OVY3UnptZ09WckxmQ2VSSVdlMFg5MGR6VU9sa0RZSTZndjNkcVlzM0tmd0FRIIEC; bili_jct=f57986cccdf574fa8be013c21ba69f00; sid=4ovm4bl0; fingerprint=34f624a48e970862c6be836c5ea87b35; buvid_fp_plain=undefined; browser_resolution=1659-941; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4NDc0NDAsImlhdCI6MTcyNjU4ODE4MCwicGx0IjotMX0.h6GQaSN55QhTvi5tIFZ-hhhO7I5X55Pm2HR1R_k1_60; bili_ticket_expires=1726847380; bsource=search_baidu; bmg_af_switch=1; bmg_src_def_domain=i2.hdslb.com; buvid_fp=34f624a48e970862c6be836c5ea87b35; bp_t_offset_599512569=978443972944855040; b_lsid=3BEDE47B_1920473A505",
    }
    # The danmaku XML request is sent without the session cookie.
    headers2 = {
        'User-Agent': ua.random,
    }

    # Timeouts keep one stalled connection from hanging the whole crawl.
    ri = requests.get(url=url, headers=headers, timeout=30)
    # \d+ (not \d*) so an empty "cid":, can never yield the URL ".../.xml".
    cid_match = re.search(r'"cid":(\d+),', ri.text)
    if cid_match is None:
        # Not every collected link is a video page; skip rather than crash
        # with AttributeError on a missing match.
        print(f"no cid found, skipping: {url}")
        return
    cid = cid_match.group(1)

    time.sleep(1)  # pause between the two requests
    cid_url = f"https://comment.bilibili.com/{cid}.xml"
    r2 = requests.get(cid_url, headers=headers2, timeout=30)
    html_doc = r2.content.decode('utf-8')

    danmu_pattern = re.compile("<d.*?>(.*?)</d>")
    danmu_list = danmu_pattern.findall(html_doc)

    # Write the danmaku one per line; the last three matches are dropped, as
    # in the original (reason not visible here -- TODO confirm).
    with open("ba.txt", 'a', newline='') as f:
        for text in danmu_list[:-3]:
            try:
                f.write(text + '\n')
            except UnicodeEncodeError:
                # The file uses the platform default encoding, which may not
                # cover every character; skip that danmaku and keep going.
                print(1)
|
||
|
|
||
|
# Step 1: collect links from search-result pages.  With i in 2..13 the query
# asks for page=i+1 (pages 3..14) and offset o=i*30 (60..390).
for i in range(2, 14):
    url = f'https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={i+1}&o={i*30}'
    get_urls(url)
    print(i)

# Example danmaku endpoint: https://comment.bilibili.com/1628109137.xml

# Step 2: download the danmaku for every collected link.
with open('aaa.txt', 'r') as file:
    i = 0
    # Iterating the file stops at EOF.  The previous readline() loop tested
    # the length before the EOF check, so the empty string returned at EOF hit
    # `continue` forever and the script never terminated.
    for line in file:
        url1 = line.strip()
        if len(url1) < 35:
            # Too short to be one of the links this script processes.
            continue
        url = 'https://' + url1
        get_barrage(url)
        print(i)
        i += 1

# Step 3: read the stored danmaku back from ba.txt and keep those that
# mention the keyword.  Blank lines are simply non-matching; they no longer
# end the scan early.
with open("ba.txt", 'r') as f:
    matched = [text for text in (line.strip() for line in f) if 'AI技术' in text]

# Open the output once (and only when there is something to write) instead of
# reopening it for every matching row.
if matched:
    with open('bb.xls', 'a', encoding='gbk', newline='') as file:
        csv.writer(file).writerows([text] for text in matched)
|