You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

181 lines
7.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
from bs4 import BeautifulSoup
from collections import Counter
from openpyxl import load_workbook
import pandas as pd
import jieba
import wordcloud
import imageio
# Request headers shared by every HTTP call in this script so the requests
# look like they come from a desktop browser (User-Agent plus a cookie string).
# NOTE(review): the cookie embeds what looks like a personal session
# (bili_ticket / buvid values with an expiry timestamp) — confirm it is meant
# to be committed, and expect it to need refreshing once it expires.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
"cookie": "CURRENT_FNVAL=4048; buvid_fp_plain=undefined; buvid4=04DF7AEF-34D9-CC62-690A-D369B35D458509591-023061415-%2FxwqHe8zHTWav6Q4ZiB1Ag%3D%3D; enable_web_push=DISABLE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; PVID=1; buvid3=D5B12366-476E-6163-1D79-774D300DF97306537infoc; b_nut=1718270506; _uuid=243B710F9-1010E3-9654-E867-4A8D8BB10AB1307743infoc; header_theme_version=CLOSE; rpdid=0zbfAHMKHr|S8rGMSwG|1uI|3w1Sum1G; fingerprint=042b265e3c7da3104d09a0692278e922; CURRENT_QUALITY=80; home_feed_column=5; browser_resolution=1659-836; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU5NDEwOTEsImlhdCI6MTcyNTY4MTgzMSwicGx0IjotMX0.j7rN8z5QOwH-7R7gPvyBxJzDLqymAWFfZeFF-QAXoTQ; bili_ticket_expires=1725941031; bp_t_offset_482950113=974463371485118464; buvid_fp=042b265e3c7da3104d09a0692278e922; b_lsid=DDE103767_191D4FCA152"
}
# Matches the Chinese term for "artificial intelligence", or a standalone
# "ai"/"AI" that is not glued to ASCII letters, digits or underscores.
# Lookarounds are used instead of \b because \b treats CJK characters as word
# characters, which made inputs such as "这是AI？" slip through.
_AI_PATTERN = re.compile(
    r'人工智能|(?<![a-z0-9_])ai(?![a-z0-9_])',
    re.IGNORECASE,
)


def contains_ai_or_artificial_intelligence(text):
    """Tell whether *text* mentions AI.

    A mention is either the literal "人工智能" or the token "ai" (any case)
    whose neighbours are not ASCII word characters, so "AI", "用ai做" and
    "这是AI？" count while "rain" or "air" do not.

    Args:
        text: The string to inspect (one danmaku line in this script).

    Returns:
        The ``re.Match`` object of the first mention, or ``None`` when there
        is none; callers are expected to use it for its truthiness.
    """
    return _AI_PATTERN.search(text)
# Fetch the text of a page
def get_html(url, timeout=10):
    """Download *url* with the shared ``headers`` and return the body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        The response body decoded as UTF-8. The HTTP status code is not
        checked, so error pages are returned as text too.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, ...).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 instead of relying on the encoding guessed from the headers.
    response.encoding = 'utf-8'
    return response.text
# Find the danmaku API links inside a page
_DANMAKU_API_PATTERN = re.compile(r'https://api\.bilibili\.com/x/v1/dm/list\.so\?')


def seek_api_urls(html_data):
    """Extract the danmaku-list API links from the anchors of an HTML page.

    Args:
        html_data: HTML source of the page to scan.

    Returns:
        A list of the distinct ``href`` values that start with
        ``https://api.bilibili.com/x/v1/dm/list.so?``, in the order in which
        they first appear in the document. Keeping document order (rather
        than going through a ``set``) makes the result reproducible between
        runs, which matters because callers pick the first element.
    """
    soup = BeautifulSoup(html_data, 'html.parser')
    # Only <a> tags that actually carry an href attribute are considered.
    hrefs = (a_tag['href'] for a_tag in soup.find_all('a', href=True))
    matching = (href for href in hrefs if _DANMAKU_API_PATTERN.match(href))
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(matching))
# Get the danmaku API links exposed by one page
def get_api_urls(url, timeout=10):
    """Request *url* and return the danmaku API links found in it.

    Args:
        url: Page to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list produced by ``seek_api_urls`` for the page, or an empty list
        when the request fails or the status code is not 200. Network errors
        are folded into the same empty-list fallback so that one unreachable
        page does not abort the whole crawl.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        # Same default as for a failed request: nothing to report.
        return []
    return seek_api_urls(response.text)
# Get the video links listed on one search result page
def get_urls(page):
    """Return the video page links found on search result page *page*.

    The search keyword is fixed in the URL (percent-encoded "巴黎奥运会").

    Args:
        page: 1-based index of the search result page.

    Returns:
        A list of distinct ``https://www.bilibili.com/video...`` links in
        document order. Pages other than 8 return every match; page 8 returns
        at most the first 6 so that 7 full pages of 42 plus 6 add up to 300
        videos. A list is always returned, even when page 8 holds fewer than
        6 matches.
    """
    url = f"https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={page}"
    soup = BeautifulSoup(get_html(url), 'html.parser')
    pattern = re.compile(r'https://www\.bilibili\.com/video')
    # The hrefs are prefixed with the scheme before filtering; anything that
    # does not become a video link is dropped by the pattern.
    candidates = (f"https:{a_tag['href']}" for a_tag in soup.find_all('a', href=True))
    # dict.fromkeys de-duplicates while keeping the order of first appearance,
    # so the selection below is reproducible between runs.
    video_urls = list(dict.fromkeys(link for link in candidates if pattern.match(link)))
    if page == 8:
        # Only 6 videos are taken from the eighth page.
        return video_urls[:6]
    return video_urls
# Turn a video link into its interface link(s)
_PART_COUNTER_PATTERN = re.compile(r'\d+\s*/\s*(\d+)')


def vedio_transform_port(url):
    """Register the interface link(s) of the video at *url* in ``port_urls``.

    The video page is downloaded and searched for the multi-part counter,
    which appears in the markup as::

        <div class="head-left">
            <h3>视频选集</h3>
            <span class="cur-page">(1/12)</span>

    When a counter "(x/y)" is found, one link per part (``?p=1`` .. ``?p=y``)
    is registered; otherwise the video is treated as a single part and the
    plain link is registered. In both cases "bilibili.com" is rewritten to
    "ibilibili.com".

    Args:
        url: Link of the video page.

    Returns:
        None. Results are added to the module-level set ``port_urls``, which
        must exist before this function is called.
    """
    soup = BeautifulSoup(get_html(url), "html.parser")
    total_parts = None  # stays None when no part counter is present
    for div_tag in soup.find_all("div", attrs={"class": "head-left"}):
        for span_tag in div_tag.find_all("span", attrs={"class": "cur-page"}):
            # The second number of "(x/y)" is the total number of parts.
            counter = _PART_COUNTER_PATTERN.search(span_tag.get_text())
            if counter:
                total_parts = int(counter.group(1))
    if total_parts is None:
        # Single video: a missing div, a missing span and an unparsable
        # counter all end up here instead of raising.
        port_urls.add(url.replace("bilibili.com", "ibilibili.com"))
        return
    for part in range(1, total_parts + 1):
        port_url = f"{url}?p={part}"
        port_urls.add(port_url.replace("bilibili.com", "ibilibili.com"))
# Crawl search result pages 1-8: seven full pages of 42 videos plus 6 videos
# from page 8, i.e. 300 videos in total. The range must end at 9 so that
# page 8 is actually requested.
for page in range(1, 9):
    # Video links of this search page (tolerate a missing result).
    vedio_urls = get_urls(page) or []
    # Interface links for this page; filled in by vedio_transform_port.
    port_urls = set()
    for vedio_url in vedio_urls:
        vedio_transform_port(vedio_url)
    # Visit every interface link and store the danmaku it leads to.
    for url in port_urls:
        api_urls = get_api_urls(url)
        if not api_urls:
            # No danmaku API link on this page.
            continue
        html_data = get_html(api_urls[0])
        # Each danmaku is the text of a <d p="..."> element.
        content_list = re.findall('<d p=".*?">(.*?)</d>', html_data)
        with open('弹幕.txt', mode='a', encoding='utf-8') as f:
            # One danmaku per line, each line terminated, so that the last
            # entry of one video is never fused with the first of the next.
            f.writelines(f'{line}\n' for line in content_list)
# Collect every stored danmaku line that mentions AI, without its line ending.
with open('弹幕.txt', 'r', encoding='utf-8') as file:
    ai_list = [
        barrage.strip()
        for barrage in file
        if contains_ai_or_artificial_intelligence(barrage)
    ]

# Count how often each AI-related danmaku occurs.
counter = Counter(ai_list)
# The eight most frequent ones are reported on the console.
most_common_barrages = counter.most_common(8)
# The complete ranking, as (text, count) pairs, goes to the spreadsheet.
ai_list1 = counter.most_common()

for barrage, count in most_common_barrages:
    print(f'弹幕: {barrage} 出现次数: {count}')

# Write the ranking to an Excel file.
excel_path = '弹幕统计.xlsx'
df = pd.DataFrame(ai_list1, columns=['弹幕', '出现次数'])
df.to_excel(excel_path, index=False, engine='openpyxl')

# Reopen the workbook to widen the danmaku column, then save it again.
wb = load_workbook(excel_path)
ws = wb.active
ws.column_dimensions['A'].width = 60
wb.save(excel_path)
# Render the AI-related danmaku as a word cloud.
# The image read here is passed to WordCloud as its mask.
img = imageio.imread('test2.png')
# All AI-related danmaku joined into one newline-separated string.
ai_str = '\n'.join(ai_list)
wc = wordcloud.WordCloud(
    font_path='msyh.ttc',
    background_color='white',
    mask=img,
    width=500,
    height=500,
)
wc.generate(ai_str)
wc.to_file('词云.png')