|
|
import os
import re
from collections import Counter

import imageio
import jieba
import pandas as pd
import requests
import wordcloud
from bs4 import BeautifulSoup
from openpyxl import load_workbook
|
|
|
|
|
|
# Request headers (User-Agent and cookie) sent with every HTTP request below.
#
# SECURITY(review): the fallback cookie embeds a real account session
# (SESSDATA, bili_jct). It is kept only so existing runs behave as before;
# it should be rotated and removed from source control. Set the
# BILIBILI_COOKIE environment variable to supply a cookie without editing
# this file.
_DEFAULT_COOKIE = "buvid3=0C047DB7-FB67-6565-B853-68B19196AEE053166infoc; buvid4=D2E32722-EB31-8B5B-8BC7-420F049CDE3657801-022071821-mG8+jYWtWHQ35A9yqIgZIA%3D%3D; buvid_fp=60e37bdf4fe67cde89d283db25adff46; _uuid=FCEA6C48-BB82-123A-61106-3F5410106BB410B03170infoc; b_nut=100; header_theme_version=CLOSE; enable_web_push=DISABLE; bsource=search_bing; CURRENT_FNVAL=4048; SESSDATA=aa6a6590%2C1742210524%2C7b1c4%2A92CjCxud8rqp6tuF7AYkzmJF0YS7_L4_80iMI3NuY5q-M7BEW3cf0_bVyhIcnZMJapP7YSVnJiQ2NVcTJZZ1ZIMFduRURJXzZXOWtaTTl2WnBFSHkwckM0UzdwY2xHMG9MNVl4c1pUSHlFaFJ4RnQ5WjY3ZHRtcm5qcDhNSVo3eXZORDczc0VlYlF3IIEC; bili_jct=5232d057d308c18c1419d19271a3b85e; DedeUserID=1576579979; DedeUserID__ckMd5=da7d6054e70acbba; home_feed_column=5; browser_resolution=1528-748; bp_t_offset_1576579979=978508367389523968; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY5MjQzMzUsImlhdCI6MTcyNjY2NTA3NSwicGx0IjotMX0.9m3fjjjWd1wCsWNsTPwS9afVCknRz7dWtL6JV0CTQgI; bili_ticket_expires=1726924275; b_lsid=E4323108A_19205503541; sid=4q83ttnl; rpdid=|(u))kkY|mmJ0J'u~kYYYmmml"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    # The environment variable wins; otherwise fall back to the legacy value.
    "cookie": os.environ.get("BILIBILI_COOKIE", _DEFAULT_COOKIE),
}
|
|
|
# Compiled once at import time: the checker below is called once per danmaku
# line, so rebuilding the pattern on every call is wasted work.
# Alternatives, in order:
#   1. "ai" delimited by regex word boundaries (e.g. "AI is here", "ai,")
#   2. the literal phrase "人工智能"
#   3. "ai" whose neighbours are a CJK character, whitespace, or the string
#      edge -- needed because CJK characters count as word characters, so
#      "\b" does not fire between a CJK character and "ai".
_AI_PATTERN = re.compile(
    r'(\bai\b|人工智能|([\u4e00-\u9fff]|\s|^)ai([\u4e00-\u9fff]|\s|$))',
    re.IGNORECASE,
)


def contains_ai_or_artificial_intelligence(text):
    """Check whether *text* mentions "ai" (case-insensitive) or "人工智能".

    Args:
        text: The string to inspect (one danmaku line in this script).

    Returns:
        The first ``re.Match`` found, or ``None`` when there is no mention.
        Callers may rely on its truthiness.
    """
    return _AI_PATTERN.search(text)
|
|
|
|
|
|
def get_html(url, timeout=10):
    """Send an HTTP GET to *url* and return the response body as text.

    The request carries the module-level ``headers``. The body is decoded
    as UTF-8 regardless of the encoding the server declares.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The decoded response text. The HTTP status code is not checked.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'
    return response.text
|
|
|
|
|
|
# Prefix that identifies a danmaku-list API link.
_DANMAKU_API_PATTERN = re.compile(r'https://api\.bilibili\.com/x/v1/dm/list\.so\?')


def seek_api_urls(html_data):
    """Extract the danmaku API links from an HTML document.

    Every ``<a href=...>`` in *html_data* is collected, duplicates are
    dropped, and only hrefs starting with
    ``https://api.bilibili.com/x/v1/dm/list.so?`` are kept.

    Args:
        html_data: HTML source text to scan.

    Returns:
        A list of matching links in document order, without duplicates.
        Empty when nothing matches.
    """
    soup = BeautifulSoup(html_data, 'html.parser')
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result (and a caller's ``[0]``) is the same on every run; a plain set
    # iterates in an arbitrary order.
    hrefs = dict.fromkeys(a_tag['href'] for a_tag in soup.find_all('a', href=True))
    return [href for href in hrefs if _DANMAKU_API_PATTERN.match(href)]
|
|
|
|
|
|
def get_api_urls(url, timeout=10):
    """Fetch *url* and return the danmaku API links found on that page.

    Args:
        url: Page to download (sent with the module-level ``headers``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The list produced by ``seek_api_urls`` for the page, or an empty
        list when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Default for a failed request: nothing to crawl.
        return []
    return seek_api_urls(response.text)
|
|
|
|
|
|
def get_urls(page):
    """Return the video links listed on one page of the search results.

    The search keyword is fixed in the URL (percent-encoded "巴黎奥运会").
    Hrefs on the page are prefixed with ``https:`` (they are expected to be
    protocol-relative, i.e. to start with ``//``) and kept only when the
    result starts with ``https://www.bilibili.com/video``.

    Args:
        page: 1-based index of the search-result page.

    Returns:
        A list of video URLs in document order, without duplicates. Pages
        other than 8 return every match; page 8 returns at most 6, so that
        7 full pages of 42 plus 6 gives 300 videos. A list is always
        returned, even when page 8 has fewer than 6 matches.
    """
    last_page = 8        # the only page that is read partially
    last_page_limit = 6  # number of videos taken from that page

    url = f"https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={page}"
    soup = BeautifulSoup(get_html(url), 'html.parser')

    # Complete each link with a scheme; dict.fromkeys de-duplicates while
    # preserving the order in which links appear on the page.
    full_links = dict.fromkeys(
        f"https:{a_tag['href']}" for a_tag in soup.find_all('a', href=True)
    )

    pattern = re.compile(r'https://www\.bilibili\.com/video')
    video_urls = [link for link in full_links if pattern.match(link)]

    if page == last_page:
        # Slicing never yields None, unlike counting up to the limit and
        # returning only once the limit is reached.
        return video_urls[:last_page_limit]
    return video_urls
|
|
|
|
|
|
def vedio_transform_port(url):
    """Turn a video URL into its ``ibilibili.com`` URL(s), one per part.

    The video page is downloaded and searched for a ``span.cur-page`` inside
    ``div.head-left``; its text is expected to look like ``(x/y)``, where
    ``y`` is the total number of parts. When no such counter is found the
    video is treated as a single-part video.

    Each resulting URL has ``bilibili.com`` replaced by ``ibilibili.com``.
    Multi-part videos get ``?p=<n>`` appended for n = 1..y.

    Side effects:
        Adds every resulting URL to the module-level set ``port_urls``,
        which must exist before this function is called.

    Args:
        url: Address of the video page.

    Returns:
        The list of URLs that were added to ``port_urls``.
    """
    soup = BeautifulSoup(get_html(url), "html.parser")

    # Total part count, or None for a single-part video.
    total_parts = None
    for div_tag in soup.find_all("div", attrs={"class": "head-left"}):
        for span_tag in div_tag.find_all("span", attrs={"class": "cur-page"}):
            # Take the digits after the slash in "(x/y)". A regex avoids
            # depending on how a word segmenter happens to split the text.
            found = re.search(r"/\s*(\d+)", span_tag.get_text())
            if found:
                total_parts = int(found.group(1))

    if total_parts is None:
        new_urls = [url.replace("bilibili.com", "ibilibili.com")]
    else:
        new_urls = [
            f"{url}?p={part}".replace("bilibili.com", "ibilibili.com")
            for part in range(1, total_parts + 1)
        ]

    port_urls.update(new_urls)
    return new_urls
|
|
|
|
|
|
# Crawl search-result pages 1..8 and append every danmaku found to 弹幕.txt.
# get_urls() special-cases page 8, so the range must include it
# (range(1, 8) stops at page 7 and never reaches that branch).
for page in range(1, 9):
    # Video links on this search page; "or []" tolerates a None result.
    vedio_urls = get_urls(page) or []

    # Interface links for this page; filled in by vedio_transform_port().
    port_urls = set()
    for vedio_url in vedio_urls:
        # Converts the video link and adds the result(s) to port_urls.
        vedio_transform_port(vedio_url)

    # Visit each interface page and download its danmaku.
    for url in port_urls:
        api_urls = get_api_urls(url)
        if not api_urls:
            # No danmaku API link on this page: nothing to download.
            continue

        api_url = api_urls[0]
        html_data = get_html(api_url)

        # Each danmaku is the text of a <d p="..."> element.
        content_list = re.findall('<d p=".*?">(.*?)</d>', html_data)
        if not content_list:
            continue

        content = '\n'.join(content_list)
        with open('弹幕.txt', mode='a', encoding='utf-8') as f:
            # End each chunk with a newline; otherwise the last danmaku of
            # this video and the first of the next share one line and are
            # counted as a single (wrong) danmaku later.
            f.write(content + '\n')
|
|
|
|
|
|
# Output workbook for the statistics.
excel_path = '弹幕统计.xlsx'

# Load every crawled danmaku line.
with open('弹幕.txt', 'r', encoding='utf-8') as file:
    content_txt = file.readlines()

# Keep only the lines that mention AI, stripped of surrounding whitespace.
ai_list = []
for barrage in content_txt:
    if not contains_ai_or_artificial_intelligence(barrage):
        continue
    ai_list.append(barrage.strip())

# Count how often each AI-related danmaku occurs.
counter = Counter(ai_list)

# The eight most frequent danmaku, for the console report.
most_common_barrages = counter.most_common(8)

# The full ranking as (danmaku, count) tuples, the shape DataFrame needs.
ai_list1 = counter.most_common()

# Print the top eight.
for barrage, count in most_common_barrages:
    print(f'弹幕: {barrage} 出现次数: {count}')

# Write the full ranking to the Excel file.
df = pd.DataFrame(ai_list1, columns=['弹幕', '出现次数'])
df.to_excel(excel_path, index=False, engine='openpyxl')

# Reopen the workbook to widen column A (the danmaku text), then save.
wb = load_workbook(excel_path)
ws = wb.active
ws.column_dimensions['A'].width = 60
wb.save(excel_path)

# Join all AI-related danmaku into a single newline-separated string.
ai_str = '\n'.join(ai_list)
|