homework/102201203王子怡.py


import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import jieba
from wordcloud import WordCloud
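# Third-party dependencies (install with pip): requests, beautifulsoup4, pandas,
# openpyxl (used by pandas for the .xlsx output), jieba, wordcloud. The word-cloud step
# also expects the Microsoft YaHei font file msyh.ttc to be reachable via font_path below.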
# Custom request headers (User-Agent plus a logged-in Bilibili session cookie)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
"cookie": "buvid3=EC7F71FB-704D-BC2F-878E-060C9BB612D457758infoc; b_nut=1723797357; buvid_fp=c08f0cc51dccc720ed3a6ae23c5993ce; buvid4=BA329B3A-045C-D770-A220-0ABF4AB3A65459138-024081608-vt1IkSnsVWpP9n3wj7cBFA%3D%3D; _uuid=938B8637-6C16-573D-B41C-C2C777FB13B1097589infoc; header_theme_version=CLOSE; enable_web_push=DISABLE; is-2022-channel=1; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4Mzk5MTUsImlhdCI6MTcyNjU4MDY1NSwicGx0IjotMX0.pZvh0NsfTw53cQQz2zS7FURglT5oimgQSKIewfAyX0U; bili_ticket_expires=1726839855; SESSDATA=f2257e8a%2C1742138056%2C7342c%2A91CjBj2a9mnoec-WGYJHyOS54IzHo75eIEzOFcQpBxBRhIJ1QkCht2IjeodCewKBabaXkSVkhUdDZiQWlhTjZTUGtIbnJnNGN6S3JwQnExakMzSV8zWkJLTVNvMW5wMmhPSEg2VDkyZm90R0tGakhhSW9UNzloTHJ6QzdEV2kzVnd3bjg4VjZhX1NBIIEC; bili_jct=68e737446e9c9510ab766b465af1c75f; DedeUserID=104367431; DedeUserID__ckMd5=50fce7fb6de2fe29; rpdid=|(JlRYJ)|R~|0J'u~kYl)YJR|; bp_t_offset_104367431=978170954758750208; b_lsid=95B101073A_19202E92625; sid=849wwfno; bsource=search_bing; browser_resolution=1272-560; home_feed_column=4"
}
# Check whether a piece of text mentions "AI" or 人工智能 (artificial intelligence)
def check_ai_content(text):
    pattern = re.compile(r'(\bai\b|人工智能|([\u4e00-\u9fff]|\s|^)ai([\u4e00-\u9fff]|\s|$))', re.IGNORECASE)
    return pattern.search(text)
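# For example, given the regex above, check_ai_content("AI生成的画面") matches via the
# CJK-adjacent "ai" branch, while check_ai_content("maintain") returns None because the
# embedded "ai" has ordinary letters on both sides.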
# Fetch a page and return its decoded HTML/XML text
def fetch_html(url):
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text
# Extract video page links from the search-result HTML
def extract_video_links(html_data):
    soup = BeautifulSoup(html_data, 'html.parser')
    video_links = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Keep only video links; result pages typically use protocol-relative hrefs
        # (//www.bilibili.com/video/...), which requests cannot fetch without a scheme
        if '/video/' in href:
            video_links.add('https:' + href if href.startswith('//') else href)
    return video_links
# Fetch a video page and collect any danmaku (bullet-comment) API links found in it
def get_danmaku_api_links(video_url):
    response = requests.get(video_url, headers=headers)
    if response.status_code == 200:
        html_data = response.text
        return extract_danmaku_api_links(html_data)
    return []
# Extract danmaku API links from the page source
def extract_danmaku_api_links(html_data):
    # Search the whole document rather than line starts; the URL almost never begins a line
    pattern = re.compile(r'https://api\.bilibili\.com/x/v1/dm/list\.so\?[^"\s<>]+')
    return pattern.findall(html_data)
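# Note: the list.so URL is rarely present verbatim in the page source; video pages usually
# embed the chat id as `"cid":<number>` in inline JSON instead. A fallback sketch under that
# assumption (the oid parameter of list.so is the video's cid):
def build_danmaku_api_links_from_cid(html_data):
    cids = re.findall(r'"cid":(\d+)', html_data)
    return [f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}' for cid in cids]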
# Main program
def main():
    ai_comments = []        # AI-related danmaku (bullet comments)
    danmaku_api_links = []  # danmaku API links
    # Collect video links from the first 7 pages of search results for 巴黎奥运会 (Paris Olympics)
    for page in range(1, 8):
        video_links = extract_video_links(fetch_html(f"https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={page}"))
        for video_link in video_links:
            danmaku_api_links.extend(get_danmaku_api_links(video_link))
    # Fetch the danmaku XML and keep only the AI-related comments
    for danmaku_api in danmaku_api_links:
        html_data = fetch_html(danmaku_api)
        soup = BeautifulSoup(html_data, 'html.parser')
        # Each danmaku is a <d p="...">text</d> element; the comment text is the element body
        content_list = [danmaku.get_text() for danmaku in soup.find_all('d')]
        for content in content_list:
            if check_ai_content(content):
                ai_comments.append(content)
    # Report how many AI-related danmaku were found
    print(f"AI相关的弹幕数量: {len(ai_comments)}")
    # Put the comments into a DataFrame and write them to an Excel file
    df = pd.DataFrame(ai_comments, columns=['弹幕'])
    df.to_excel('ai_comments.xlsx', index=False)
    # Draw the word cloud; segment the Chinese text with jieba so WordCloud can split words
    ai_str = ' '.join(jieba.lcut(' '.join(ai_comments)))
    wc = WordCloud(width=500, height=500, background_color='white', font_path='msyh.ttc')
    wc.generate(ai_str)
    wc.to_file('ai_wordcloud.png')

if __name__ == "__main__":
    main()
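# Running the script writes two files to the working directory: ai_comments.xlsx
# (the filtered danmaku) and ai_wordcloud.png (the word cloud image).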