You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

274 lines
9.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import sys
import io
import time
import random
import os
import warnings
import concurrent.futures
import cProfile
import pstats
from collections import Counter
from bs4 import XMLParsedAsHTMLWarning
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
import requests
import jieba
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# --- Configuration and session setup ---
# Hard-coded Bilibili authentication cookies attached to every API request.
# NOTE(review): SESSDATA / bili_jct are live session credentials with an
# expiry (see bili_ticket_expires) — they will stop working and should be
# loaded from an environment variable or config file rather than committed.
LATEST_COOKIES = {
    "_uuid": "19863C108-86AA-749A-B6109-467D313F610DA89057infoc",
    "b_lsid": "72611EAE_19A869A3F71",
    "b_nut": "1750739388",
    "bili_jct": "1f059040f95b75b7a38762995189f81e",
    "bili_ticket": "eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NjM0NTE0NTAsImlhdCI6MTc2MzE5MjE5MCwicGx0IjotMX0.FO8oRZK6ZtwGkHZc-NC48kJbBxCo7tEeSvq5YxZpuAg",
    "bili_ticket_expires": "1763451390",
    "buvid3": "8E7992AB-88DC-C3F3-9906-629AB81924B588319infoc",
    "buvid4": "790AE183-11E7-FFDE-8FD5-0C57D524B62589075-025062412-LJ/KMBh5dbu+dWI+zTfBuQ%3D%3D",
    "DedeUserID": "341882542",
    "DedeUserID__ckMd5": "40c6b5fa2d4c8265",
    "SESSDATA": "9eaeb25f%2C1766291503%2Cd8726%2A62CjBfu020dj3EhPNw4mTBS-Qwc8c-SoJ0TdXVevi93oilOeDV2Z1SkDPQ3tXd05-oBwkSVnpBMDlBaFZ3cmdkSFdtTGhaNHJ5a1o4NWlaaG9nUlZpSlJOTmFObTBQMVlqWFhqZE1Melp6QWJpN1BEX1YzMTN2NlhaTDJrbld2aGs0ei14OXhYd2l3IIEC",
    "PVID": "1"
}
def create_session():
    """Build a requests.Session with a retry-enabled adapter mounted.

    Retries up to 2 times with exponential backoff on common transient
    HTTP errors (429 and 5xx).
    """
    retry_policy = Retry(
        total=2,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    http_adapter = HTTPAdapter(max_retries=retry_policy)
    sess = requests.Session()
    for scheme in ("http://", "https://"):
        sess.mount(scheme, http_adapter)
    return sess


# Single shared session reused by every request in this module.
global_session = create_session()
# --- 1. Crawling module ---
def fetch_danmakus(aid):
    """Download all danmaku (bullet-comment) texts for one video ``aid``.

    First resolves the video's cid via the web-interface view API, then
    fetches the danmaku XML and extracts every non-empty ``<d>`` element.
    Returns [] on any lookup or network failure (best-effort crawler).
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    try:
        view_url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
        view_resp = global_session.get(
            view_url,
            headers=request_headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        cid = view_resp.json().get("data", {}).get("cid")
        if not cid:
            return []
        xml_resp = global_session.get(
            f"https://comment.bilibili.com/{cid}.xml",
            headers=request_headers,
            cookies=LATEST_COOKIES,
            timeout=8
        )
        xml_resp.encoding = "utf-8"
        soup = BeautifulSoup(xml_resp.text, "lxml-xml")
        stripped = (d.text.strip() for d in soup.find_all("d"))
        return [t for t in stripped if t]
    except Exception as e:
        # Best-effort: report a short error and let the caller continue.
        print(f"aid={aid} 爬取失败:{str(e)[:20]}")
        return []
def get_top_videos_aids(keyword, max_videos=120):
    """Search Bilibili for *keyword* and collect up to *max_videos* video aids.

    Pages through the web search API (at most 5 pages of 30 results,
    ordered by "totalrank") and returns the aids as strings, preserving
    the API's ranking order.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"}
    aids = []
    seen = set()  # BUG FIX: the original deduplicated with list(set(aids)),
    # which shuffled the ranking nondeterministically before the final
    # aids[:max_videos] slice; dedupe in rank order instead.
    page = 1
    page_size = 30
    while len(aids) < max_videos and page <= 5:
        params = {
            "keyword": keyword,
            "page": page,
            "page_size": page_size,
            "search_type": "video",
            "order": "totalrank"
        }
        try:
            # Polite randomized delay to reduce the chance of rate-limiting.
            time.sleep(1.5 + random.random())
            resp = global_session.get(
                "https://api.bilibili.com/x/web-interface/search/type",
                params=params,
                headers=headers,
                cookies=LATEST_COOKIES,
                timeout=10
            )
            data = resp.json()
            if data.get("code") != 0:
                break
            video_list = data.get("data", {}).get("result", [])
            if not video_list:
                break
            for video in video_list:
                aid = video.get("aid")
                if aid and str(aid) not in seen:
                    seen.add(str(aid))
                    aids.append(str(aid))
            print(f"关键词[{keyword}]页{page}累计AID{len(aids)}/{max_videos}")
            page += 1
        except Exception:
            # Best-effort: skip a failed page and try the next one.
            page += 1
            continue
    return aids[:max_videos]
# --- 2. Filtering and saving ---
def filter_spam_danmakus(all_danmakus):
    """Remove "spam" danmakus that repeat unusually often.

    A danmaku text is spam when it occurs at least max(5, 0.5% of total)
    times. Returns the surviving danmakus in their original order; []
    for empty input.
    """
    if not all_danmakus:
        return []
    total = len(all_danmakus)
    counter = Counter(all_danmakus)
    # The original wrote `... if total > 0 else 5`, but the empty case has
    # already returned above, so that conditional was dead code.
    min_count = max(5, int(total * 0.005))
    spam_content = {content for content, count in counter.items() if count >= min_count}
    filtered = [d for d in all_danmakus if d not in spam_content]
    print(f"\n已过滤{len(all_danmakus) - len(filtered)}条刷屏弹幕(剩余{len(filtered)}条)")
    return filtered
def save_all_danmakus(all_danmakus):
    """Write every collected danmaku to all_danmakus.txt, one per line.

    Prints a notice and returns without writing when the list is empty;
    any write failure is reported instead of raised (best-effort).
    """
    if not all_danmakus:
        print("无弹幕可保存")
        return
    try:
        with open("all_danmakus.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(all_danmakus))
        # BUG FIX: the original success message had an unmatched closing
        # parenthesis ("...txt{n}条)"); restore the opening "(共".
        print(f"全部弹幕列表已保存至all_danmakus.txt(共{len(all_danmakus)}条)")
    except Exception as e:
        print(f"保存全部弹幕失败:{e}")
# --- 3. Word cloud and statistics ---
def generate_ai_wordcloud(filtered_danmakus):
    """Render a word-cloud PNG from the filtered danmaku texts.

    Segments the Chinese text with jieba, picks the first CJK font found
    on disk, and saves ai_tech_wordcloud.png; on failure it falls back to
    a simplified cloud (ai_tech_wordcloud_simple.png). Returns None.
    """
    if not filtered_danmakus:
        print("词云生成失败:无有效弹幕数据")
        return
    text = " ".join(filtered_danmakus)
    if len(text) < 100:
        print("警告:有效文本过短,词云可能不完整")
    try:
        # Register domain terms so jieba keeps them as single tokens.
        jieba.add_word("大模型")
        jieba.add_word("生成式AI")
        cut_text = " ".join(jieba.cut(text, HMM=False))
    except Exception as e:
        # Fall back to unsegmented text if jieba fails.
        print(f"分词失败,使用原始文本:{e}")
        cut_text = text
    # NOTE(review): this set literal contains only empty strings — the
    # original Chinese stopwords appear to have been lost to character
    # mangling (cf. the file's "ambiguous Unicode" warning). Restore them;
    # as written it only adds "" to STOPWORDS.
    stopwords = STOPWORDS.union({"", "", "", "", "", "", ""})
    # Candidate CJK font paths: Windows (simhei/msyh), macOS (PingFang),
    # Linux (Noto Sans CJK), tried in order.
    font_candidates = [
        "C:/Windows/Fonts/simhei.ttf",
        "C:/Windows/Fonts/msyh.ttc",
        "/System/Library/Fonts/PingFang.ttc",
        "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc"
    ]
    font_path = None
    for candidate in font_candidates:
        if os.path.exists(candidate):
            font_path = candidate
            break
    if not font_path:
        # WordCloud will fall back to its bundled (non-CJK) font.
        font_path = None
        print("警告:未找到中文字体,可能显示乱码")
    try:
        wc = WordCloud(
            font_path=font_path,
            width=1200,
            height=800,
            background_color="white",
            max_words=150,
            stopwords=stopwords,
            prefer_horizontal=0.9,
            collocations=False  # avoid duplicated bigram phrases in the cloud
        ).generate(cut_text)
        plt.figure(figsize=(12, 8), dpi=200)
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig("ai_tech_wordcloud.png", dpi=300, bbox_inches="tight")
        print("词云图已成功生成ai_tech_wordcloud.png")
        plt.close()
    except Exception as e:
        print(f"词云生成失败:{e}")
        # Last-resort fallback: default-settings cloud, no custom font.
        try:
            wc = WordCloud(background_color="white").generate(cut_text)
            wc.to_file("ai_tech_wordcloud_simple.png")
            print("已生成简化版词云ai_tech_wordcloud_simple.png")
        except:  # NOTE(review): bare except — should be narrowed to Exception
            print("无法生成词云,请检查文本数据")
def stat_ai_danmakus(filtered_danmakus):
    """Report the 8 most frequent danmakus mentioning AI-application keywords.

    Prints the top-8 list and exports it to ai_danmakus_top8.xlsx; prints
    a notice and returns early when nothing matches.
    """
    ai_keywords = {
        "生成", "对话", "编程", "模型", "训练", "微调", "医疗", "教育",
        "金融", "推理", "部署", "参数", "API", "多模态", "效率"
    }
    matched = []
    for danmaku in filtered_danmakus:
        if any(kw in danmaku for kw in ai_keywords):
            matched.append(danmaku)
    if not matched:
        print("无有效AI技术应用弹幕")
        return
    top8 = Counter(matched).most_common(8)
    print("\n===== AI技术应用弹幕前8名 =====")
    for i, (content, count) in enumerate(top8, 1):
        print(f"{i}. {content} → 出现{count}")
    report = pd.DataFrame(top8, columns=["弹幕内容", "出现次数"])
    report.to_excel("ai_danmakus_top8.xlsx", index=False)
# --- Main entry point ---
def main():
    """Crawl danmakus for each search keyword, then save, filter, and analyze."""
    keywords = ["大语言模型", "大模型", "LLM"]
    all_danmakus = []
    for keyword in keywords:
        print(f"\n===== 爬取关键词:{keyword} =====")
        aids = get_top_videos_aids(keyword, max_videos=120)
        if not aids:
            print(f"{keyword}视频,跳过")
            continue
        # Fetch danmakus for all aids concurrently (I/O-bound work).
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            for danmakus in executor.map(fetch_danmakus, aids):
                all_danmakus.extend(danmakus)
        print(f"{keyword}处理完成,累计弹幕:{len(all_danmakus)}")
    save_all_danmakus(all_danmakus)
    filtered = filter_spam_danmakus(all_danmakus)
    generate_ai_wordcloud(filtered)
    stat_ai_danmakus(filtered)
# --- Profiling entry point ---
# Runs the whole crawl/analysis under cProfile and saves the stats both to
# the console (top 20 by total time) and to performance_profile.prof.
if __name__ == "__main__":
    profiler = cProfile.Profile()
    profiler.enable()
    main()
    profiler.disable()
    print("\n===== 性能分析结果 =====")
    stats = pstats.Stats(profiler)
    # pstats.SortKey was added in Python 3.7; fall back to the string key
    # on older interpreters.
    try:
        stats.sort_stats(pstats.SortKey.TOTAL)
    except AttributeError:
        stats.sort_stats('tottime')
    stats.print_stats(20)
    stats.dump_stats("performance_profile.prof")
    print("\n完整性能分析报告已保存至performance_profile.prof")
    print("可使用 'snakeviz performance_profile.prof' 可视化分析结果")