danmu/102201214 许莎莎.py

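"""Collect Bilibili danmaku (bullet comments) from the Paris Olympics search
results, keep the ones that mention "ai" or "人工智能", count how often each
appears, and export the statistics to an Excel file."""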

from bs4 import BeautifulSoup
import re
import pandas as pd
import jieba
import requests
import imageio
import wordcloud
from openpyxl import load_workbook
from collections import Counter
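# imageio and wordcloud are not referenced in this part of the script; they are
# presumably used by the word-cloud step later in the file.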
# User-Agent and cookie used for every request
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
"cookie": "buvid3=0C047DB7-FB67-6565-B853-68B19196AEE053166infoc; buvid4=D2E32722-EB31-8B5B-8BC7-420F049CDE3657801-022071821-mG8+jYWtWHQ35A9yqIgZIA%3D%3D; buvid_fp=60e37bdf4fe67cde89d283db25adff46; _uuid=FCEA6C48-BB82-123A-61106-3F5410106BB410B03170infoc; b_nut=100; header_theme_version=CLOSE; enable_web_push=DISABLE; bsource=search_bing; CURRENT_FNVAL=4048; SESSDATA=aa6a6590%2C1742210524%2C7b1c4%2A92CjCxud8rqp6tuF7AYkzmJF0YS7_L4_80iMI3NuY5q-M7BEW3cf0_bVyhIcnZMJapP7YSVnJiQ2NVcTJZZ1ZIMFduRURJXzZXOWtaTTl2WnBFSHkwckM0UzdwY2xHMG9MNVl4c1pUSHlFaFJ4RnQ5WjY3ZHRtcm5qcDhNSVo3eXZORDczc0VlYlF3IIEC; bili_jct=5232d057d308c18c1419d19271a3b85e; DedeUserID=1576579979; DedeUserID__ckMd5=da7d6054e70acbba; home_feed_column=5; browser_resolution=1528-748; bp_t_offset_1576579979=978508367389523968; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY5MjQzMzUsImlhdCI6MTcyNjY2NTA3NSwicGx0IjotMX0.9m3fjjjWd1wCsWNsTPwS9afVCknRz7dWtL6JV0CTQgI; bili_ticket_expires=1726924275; b_lsid=E4323108A_19205503541; sid=4q83ttnl; rpdid=|(u))kkY|mmJ0J'u~kYYYmmml"
}
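# Note: the cookie above belongs to one logged-in Bilibili session (SESSDATA, bili_jct, ...)
# and will expire; replace it with a cookie from your own browser if requests stop working.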
# Check whether a piece of text mentions AI: matches the standalone word "ai",
# the phrase "人工智能", or "ai" directly adjacent to Chinese characters or whitespace
def contains_ai_or_artificial_intelligence(text):
    ai_pattern = re.compile(r'(\bai\b|人工智能|([\u4e00-\u9fff]|\s|^)ai([\u4e00-\u9fff]|\s|$))', re.IGNORECASE)
    return ai_pattern.search(text)
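# Example behaviour of the filter above: "AI修复" and "人工智能解说" match,
# while words that merely contain the letters, such as "air", do not.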
# Fetch HTML text: send an HTTP GET request to the given URL and return the page source
def get_html(url):
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    html = response.text
    return html
# Find the danmaku API links: parse the HTML and collect the href of every <a> tag
def seek_api_urls(html_data):
    soup = BeautifulSoup(html_data, 'html.parser')
    urls = set()
    a_tags = soup.find_all('a', href=True)
    for a_link in a_tags:
        # Get the value of href
        link = a_link['href']
        urls.add(link)
    # Keep only links that point to the danmaku list API
    pattern = re.compile(r'https://api\.bilibili\.com/x/v1/dm/list\.so\?')
    api_urls = [url_find for url_find in urls if pattern.match(url_find)]
    # Return the matching links
    return api_urls
# Fetch a page and return the danmaku API links found on it
def get_api_urls(url):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # Request succeeded: search the page for API links
        html_data = response.text
        api_urls = seek_api_urls(html_data)
        return api_urls
    else:
        # Return an empty list as the default value
        return []
# Collect video-page links from one page of Bilibili search results
def get_urls(page):
    # Search-result page URL (keyword: 巴黎奥运会, the Paris Olympics)
    url = f"https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={page}"
    html_data = get_html(url)
    soup = BeautifulSoup(html_data, 'html.parser')
    # Set that stores the filtered links
    urls = set()
    a_tags = soup.find_all('a', href=True)
    for a_link in a_tags:
        link = a_link['href']
        # The hrefs are protocol-relative, so prepend the scheme
        full_link = f'https:{link}'
        urls.add(full_link)
    # Keep only links to video pages
    pattern = re.compile(r'https://www\.bilibili\.com/video')
    # Pages 1-7 are read in full (7 x 42 = 294 videos)
    if page != 8:
        vedieo_urls_f = [url_find for url_find in urls if pattern.match(url_find)]
        return vedieo_urls_f
    # Page 8: only the first 6 videos are taken
    else:
        vedieo_urls_f = []
        num = 0
        for url_find in urls:
            if pattern.match(url_find):
                num = num + 1
                vedieo_urls_f.append(url_find)
                if num == 6:
                    return vedieo_urls_f
        return vedieo_urls_f
# Convert a video link into interface links (one per part), added to the global port_urls set
def vedio_transform_port(url):
    html_data = get_html(url)
    soup = BeautifulSoup(html_data, "html.parser")
    page_num = []    # stores the total number of parts (分P)
    span_tag = None  # flag used to tell whether the video is split into parts
    div_tags = soup.findAll("div", attrs={"class": "head-left"})  # find the divs with class=head-left
    for tag in div_tags:
        span_tag = tag.findAll("span", attrs={"class": "cur-page"})  # then find the spans with class=cur-page inside them
    if span_tag is None:
        # No part selector found: a single video, so convert the link directly
        port_url = url.replace("bilibili.com", "ibilibili.com")
        port_urls.add(port_url)
    else:
        for page in span_tag:
            # The span text is "(x/y)"; jieba splits it into '(', 'x', '/', 'y', ')', where y is the total number of parts
            pages = jieba.lcut(page.get_text())
            page_num = pages[3]  # take the value of y
        # Build the converted link for every part of the video
        for page in range(1, int(page_num) + 1):
            port_url = f"{url}?p={page}"
            port_urls.add(port_url.replace("bilibili.com", "ibilibili.com"))
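# vedio_transform_port returns nothing: it only adds the converted ibilibili.com links
# to the module-level set port_urls, which the crawling loop below (re)creates before
# calling it; those pages are later scraped by get_api_urls for the
# api.bilibili.com/x/v1/dm/list.so danmaku link.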
# Crawl the search-result pages; get_urls reads pages 1-7 in full and page 8 partially
for page in range(1, 9):
    # Get the video links on this search page
    vedio_urls = get_urls(page)
    # Set that collects the converted interface links
    port_urls = set()
    for vedio_url in vedio_urls:
        # Convert each video link into interface links (added to port_urls)
        vedio_transform_port(vedio_url)
    # Visit each interface link
    for url in port_urls:
        # Look for the danmaku API link on the page
        api_urls = get_api_urls(url)
        # Check whether any API link was found
        if api_urls:
            # Take the first API link and download the danmaku XML
            api_url = api_urls[0]
            html_data = get_html(api_url)
            # Every danmaku is wrapped in a <d p="..."> element; extract the text
            content_list = re.findall('<d p=".*?">(.*?)</d>', html_data)
            content = '\n'.join(content_list)
            # Append this video's danmaku to 弹幕.txt, one per line
            with open('弹幕.txt', mode='a', encoding='utf-8') as f:
                f.write(content + '\n')
ai_list = []
most_common_barrages = []
# Read the collected danmaku back and keep only the AI-related ones
with open('弹幕.txt', 'r', encoding='utf-8') as file:
    content_txt = file.readlines()
    for barrage in content_txt:
        if contains_ai_or_artificial_intelligence(barrage):
            ai_list.append(barrage.strip())
# Use Counter to count how many times each danmaku appears
counter = Counter(ai_list)
# Get the 8 most frequent danmaku
most_common_barrages = counter.most_common(8)
# Convert to (danmaku, count) pairs so the data can be written to Excel
ai_list1 = counter.most_common()
# Print the result
for barrage, count in most_common_barrages:
    print(f'弹幕: {barrage} 出现次数: {count}')
df = pd.DataFrame(ai_list1, columns=['弹幕', '出现次数'])
# Write to an Excel file and widen column A so the danmaku text stays readable
excel_path = '弹幕统计.xlsx'
df.to_excel(excel_path, index=False, engine='openpyxl')
wb = load_workbook(excel_path)
ws = wb.active
ws.column_dimensions['A'].width = 60
wb.save(excel_path)
ai_str = '\n'.join(ai_list)  # join the filtered danmaku into a single string