homework/102201203王子怡.py


import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import jieba
from wordcloud import WordCloud
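# Third-party dependencies (install with pip): requests, beautifulsoup4, pandas,
# openpyxl (used by pandas for the .xlsx output), jieba, wordcloud. The word-cloud step
# also expects the Microsoft YaHei font file msyh.ttc to be reachable via font_path below.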
# Custom request headers (User-Agent plus a logged-in Bilibili session cookie)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
"cookie": "buvid3=EC7F71FB-704D-BC2F-878E-060C9BB612D457758infoc; b_nut=1723797357; buvid_fp=c08f0cc51dccc720ed3a6ae23c5993ce; buvid4=BA329B3A-045C-D770-A220-0ABF4AB3A65459138-024081608-vt1IkSnsVWpP9n3wj7cBFA%3D%3D; _uuid=938B8637-6C16-573D-B41C-C2C777FB13B1097589infoc; header_theme_version=CLOSE; enable_web_push=DISABLE; is-2022-channel=1; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4Mzk5MTUsImlhdCI6MTcyNjU4MDY1NSwicGx0IjotMX0.pZvh0NsfTw53cQQz2zS7FURglT5oimgQSKIewfAyX0U; bili_ticket_expires=1726839855; SESSDATA=f2257e8a%2C1742138056%2C7342c%2A91CjBj2a9mnoec-WGYJHyOS54IzHo75eIEzOFcQpBxBRhIJ1QkCht2IjeodCewKBabaXkSVkhUdDZiQWlhTjZTUGtIbnJnNGN6S3JwQnExakMzSV8zWkJLTVNvMW5wMmhPSEg2VDkyZm90R0tGakhhSW9UNzloTHJ6QzdEV2kzVnd3bjg4VjZhX1NBIIEC; bili_jct=68e737446e9c9510ab766b465af1c75f; DedeUserID=104367431; DedeUserID__ckMd5=50fce7fb6de2fe29; rpdid=|(JlRYJ)|R~|0J'u~kYl)YJR|; bp_t_offset_104367431=978170954758750208; b_lsid=95B101073A_19202E92625; sid=849wwfno; bsource=search_bing; browser_resolution=1272-560; home_feed_column=4"
}
# Check whether a piece of text mentions "AI" or 人工智能 (artificial intelligence)
def check_ai_content(text):
    pattern = re.compile(r'(\bai\b|人工智能|([\u4e00-\u9fff]|\s|^)ai([\u4e00-\u9fff]|\s|$))', re.IGNORECASE)
    return pattern.search(text)
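# For example, given the regex above, check_ai_content("AI生成的画面") matches via the
# CJK-adjacent "ai" branch, while check_ai_content("maintain") returns None because the
# embedded "ai" has ordinary letters on both sides.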
# Fetch a page and return its decoded HTML/XML text
def fetch_html(url):
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text
# Extract video page links from the search-result HTML
def extract_video_links(html_data):
    soup = BeautifulSoup(html_data, 'html.parser')
    video_links = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Keep only video links; result pages typically use protocol-relative hrefs
        # (//www.bilibili.com/video/...), which requests cannot fetch without a scheme
        if '/video/' in href:
            video_links.add('https:' + href if href.startswith('//') else href)
    return video_links
# Fetch a video page and collect any danmaku (bullet-comment) API links found in it
def get_danmaku_api_links(video_url):
    response = requests.get(video_url, headers=headers)
    if response.status_code == 200:
        html_data = response.text
        return extract_danmaku_api_links(html_data)
    return []
# Extract danmaku API links from the page source
def extract_danmaku_api_links(html_data):
    # Search the whole document rather than line starts; the URL almost never begins a line
    pattern = re.compile(r'https://api\.bilibili\.com/x/v1/dm/list\.so\?[^"\s<>]+')
    return pattern.findall(html_data)
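# Note: the list.so URL is rarely present verbatim in the page source; video pages usually
# embed the chat id as `"cid":<number>` in inline JSON instead. A fallback sketch under that
# assumption (the oid parameter of list.so is the video's cid):
def build_danmaku_api_links_from_cid(html_data):
    cids = re.findall(r'"cid":(\d+)', html_data)
    return [f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}' for cid in cids]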
# Main program
def main():
    ai_comments = []        # AI-related danmaku (bullet comments)
    danmaku_api_links = []  # danmaku API links
    # Collect video links from the first 7 pages of search results for 巴黎奥运会 (Paris Olympics)
    for page in range(1, 8):
        video_links = extract_video_links(fetch_html(f"https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={page}"))
        for video_link in video_links:
            danmaku_api_links.extend(get_danmaku_api_links(video_link))
    # Fetch the danmaku XML and keep only the AI-related comments
    for danmaku_api in danmaku_api_links:
        html_data = fetch_html(danmaku_api)
        soup = BeautifulSoup(html_data, 'html.parser')
        # Each danmaku is a <d p="...">text</d> element; the comment text is the element body
        content_list = [danmaku.get_text() for danmaku in soup.find_all('d')]
        for content in content_list:
            if check_ai_content(content):
                ai_comments.append(content)
    # Report how many AI-related danmaku were found
    print(f"AI相关的弹幕数量: {len(ai_comments)}")
    # Put the comments into a DataFrame and write them to an Excel file
    df = pd.DataFrame(ai_comments, columns=['弹幕'])
    df.to_excel('ai_comments.xlsx', index=False)
    # Draw the word cloud; segment the Chinese text with jieba so WordCloud can split words
    ai_str = ' '.join(jieba.lcut(' '.join(ai_comments)))
    wc = WordCloud(width=500, height=500, background_color='white', font_path='msyh.ttc')
    wc.generate(ai_str)
    wc.to_file('ai_wordcloud.png')

if __name__ == "__main__":
    main()
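# Running the script writes two files to the working directory: ai_comments.xlsx
# (the filtered danmaku) and ai_wordcloud.png (the word cloud image).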