complete basic function

main
zrj 6 months ago
parent af79ad9d01
commit b747054991

Binary file not shown.

Binary file not shown.

@ -0,0 +1,20 @@
from tool.scrawl import Scawler
from tool.static import analyze_danmu_statistics
from tool.cloud_show import Cloud_shower

if __name__ == "__main__":
    # Re-enable this block to re-scrape the danmaku instead of reading the cached file:
    # scawler = Scawler()
    # ls = scawler.work(1, 40)
    # with open('./raw_danmu.txt', 'w', encoding='utf-8') as f:
    #     for danmu in ls:
    #         f.write(danmu + '\n')
    with open('./raw_danmu.txt', 'r', encoding='utf-8') as f:
        ls = [line.strip() for line in f if line.strip()]
    danmu_counter = analyze_danmu_statistics(ls)
    if danmu_counter:  # analyze_danmu_statistics returns None on failure
        shower = Cloud_shower()
        shower.to_show(danmu_counter)
        print("Word cloud generated; saved as ai_danmu_stylecloud.png")

File diff suppressed because it is too large.

@ -0,0 +1,53 @@
from stylecloud import gen_stylecloud


class Cloud_shower:
    def __init__(self):
        pass

    def to_show(self, sample_danmu_data_frq):
        # Generate the word cloud with stylecloud: expand the {word: frequency}
        # mapping into a space-joined blob, repeating each word `freq` times so
        # the cloud reflects the counts.
        text_list = []
        for word, freq in sample_danmu_data_frq.items():
            text_list.extend([word] * freq)
        text = " ".join(text_list)
        gen_stylecloud(
            text=text,                                 # preprocessed text
            size=1024,                                 # image size; larger is sharper
            font_path='msyh.ttc',                      # Chinese font path (e.g. Microsoft YaHei)
            output_name='ai_danmu_stylecloud.png',     # output file name
            icon_name='fas fa-question-circle',
            custom_stopwords=[],                       # custom stopwords (the original entries were lost in the diff)
            palette='colorbrewer.qualitative.Set1_8',  # preset color palette
            # background_color='white',                # background color
            gradient='horizontal',                     # gradient direction
            max_font_size=200,                         # largest font size
            max_words=500,                             # maximum number of words shown
        )


if __name__ == "__main__":
    import tool.csv_parse as cp

    # Usage example
    data_map = cp.read_csv_to_list('./result.csv')
    print(data_map)
    shower = Cloud_shower()
    # Inline sample data, kept for quick experiments (unused below)
    sample_danmu_data = [
        "AI技术真厉害",
        "大模型应用广泛",
        "深度学习",
        "神经网络",
        "机器学习",
        "AI技术真厉害",
        "自然语言处理",
        "计算机视觉",
        "大模型应用广泛",
        "强化学习",
        "AI技术真厉害",
        "生成式AI",
    ]
    shower.to_show(data_map)
    print("Word cloud generated; saved as ai_danmu_stylecloud.png")

@ -0,0 +1,61 @@
import csv
from typing import Dict, List, Tuple


def read_csv_to_list(csv_file: str) -> Dict[str, int]:
    """
    Read a CSV file into a {content: frequency} mapping.

    Args:
        csv_file: path to the CSV file
    Returns:
        Dict[str, int]: mapping of content to frequency
    """
    result = {}
    try:
        with open(csv_file, 'r', encoding='utf-8') as file:
            csv_reader = csv.reader(file)
            # Skip the header row, if present
            headers = next(csv_reader, None)
            for row in csv_reader:
                content = row[0].strip()  # content column
                frequency = int(row[1])   # frequency column, converted to int
                result[content] = frequency
    except FileNotFoundError:
        print(f"Error: file {csv_file} not found")
    except ValueError as e:
        print(f"Error: could not convert frequency value - {e}")
    except Exception as e:
        print(f"Error while reading the file: {e}")
    return result


def trans_list_to_csv(ls: List[Tuple[str, int]], csv_file: str) -> None:
    """
    Write a list of (content, frequency) tuples to a CSV file.

    Args:
        ls: list of (content, frequency) tuples
        csv_file: path of the CSV file to write
    """
    try:
        with open(csv_file, 'w', encoding='utf-8', newline='') as file:
            csv_writer = csv.writer(file)
            # Header row
            csv_writer.writerow(['content', 'frequency'])
            # Data rows; commas inside the content would break naive parsing,
            # so they are replaced with underscores
            for content, frequency in ls:
                csv_writer.writerow([content.replace(",", "_"), frequency])
        print(f"Data saved to {csv_file}")
    except Exception as e:
        print(f"Error while writing the file: {e}")

@ -0,0 +1,309 @@
# Keyword file for AI/LLM-related danmaku filtering
keywords = [
# Basic terminology
'AI',
'人工智能',
'机器学习',
'深度学习',
'神经网络',
'自然语言处理',
'NLP',
'计算机视觉',
'CV',
'大模型',
'GPT',
'ChatGPT',
'文心一言',
'通义千问',
'智谱',
'讯飞星火',
'强化学习',
'生成式AI',
'AIGC',
'LLM',
'扩散模型',
'Transformer',
'BERT',
'Stable Diffusion',
'Midjourney',
'DALL-E',
'AI绘画',
'AI写作',
'AI编程',
'自动驾驶',
'智能助手',
'语音识别',
'图像识别',
'目标检测',
'语义分割',
'数据挖掘',
'知识图谱',
'推荐系统',
# Large-model related
'GPT-3',
'GPT-4',
'GPT-5',
'ChatGLM',
'LLaMA',
'Vicuna',
'Alpaca',
'Bloom',
'T5',
'BART',
'ERNIE',
'Claude',
'Gemini',
'PaLM',
'LaMDA',
'Codex',
'Copilot',
'InstructGPT',
'Sparrow',
'Gopher',
'Chinchilla',
'Jurassic',
'WuDao',
'PanGu',
'PLUG',
'M6',
'CPM',
'EVA',
'CogView',
# Multimodal AI
'多模态',
'视觉语言模型',
'VLM',
'DALL-E2',
'DALL-E3',
'Imagen',
'Parti',
'CogVideo',
'Make-A-Video',
'Phenaki',
'NUWA',
'CogView2',
'CogView3',
'文生图',
'图生文',
'文生视频',
'语音合成',
'TTS',
'ASR',
'语音克隆',
# Technical architecture
'注意力机制',
'自注意力',
'多头注意力',
'编码器',
'解码器',
'预训练',
'微调',
'提示工程',
'Prompt',
'思维链',
'CoT',
'零样本学习',
'小样本学习',
'指令调优',
'RLHF',
'人类反馈强化学习',
'对齐',
'缩放定律',
'涌现能力',
# Application scenarios
'智能客服',
'聊天机器人',
'虚拟人',
'数字人',
'AI主播',
'内容生成',
'代码生成',
'智能编程',
'低代码',
'无代码',
'智能文档',
'RAG',
'检索增强',
'AI搜索',
'智能问答',
'知识库',
'智能诊断',
'AI制药',
'AI金融',
'量化交易',
'风险控制',
'智能投顾',
'AI教育',
'个性化学习',
'智慧城市',
'智能交通',
'工业AI',
'预测性维护',
'质量检测',
'AI农业',
'精准农业',
'智能家居',
# Tools and frameworks
'TensorFlow',
'PyTorch',
'Keras',
'Hugging Face',
'Transformers库',
'Diffusers',
'LangChain',
'LlamaIndex',
'AutoGPT',
'BabyAGI',
'OpenAI',
'Anthropic',
'Google AI',
'Microsoft AI',
'Meta AI',
'百度AI',
'阿里云',
'腾讯云',
'华为云',
'讯飞开放平台',
# Technical concepts
'监督学习',
'无监督学习',
'半监督学习',
'自监督学习',
'迁移学习',
'元学习',
'联邦学习',
'图神经网络',
'GNN',
'胶囊网络',
'神经图灵机',
'生成对抗网络',
'GAN',
'变分自编码器',
'VAE',
'流模型',
'标准化流',
'贝叶斯深度学习',
'可解释AI',
'XAI',
'因果推断',
'鲁棒性',
'公平性',
# Emerging directions
'具身智能',
'AGI',
'通用人工智能',
'超级智能',
'AI安全',
'对齐问题',
'AI伦理',
'AI治理',
'AI法规',
'AI for Science',
'科学智能',
'AlphaFold',
'天气预报',
'气候模拟',
'蛋白质设计',
'材料发现',
'AI辅助创作',
# Chinese vendors and labs
'文心一言',
'通义千问',
'讯飞星火',
'腾讯混元',
'字节豆包',
'智谱AI',
'月之暗面',
'深度求索',
'零一万物',
'百川智能',
'昆仑万维',
'幻方AI',
'上海AI实验室',
'北京智源',
'之江实验室'
]
# Smaller keyword list kept for validation
# ai_keywords = [
# 'AI', '人工智能', '机器学习', '深度学习', '神经网络', '自然语言处理', 'NLP', '计算机视觉', 'CV', '大模型', 'GPT', 'ChatGPT', '文心一言', '通义千问',
# '智谱', '讯飞星火', '强化学习', '生成式AI', 'AIGC', 'LLM', '扩散模型', 'Transformer', 'BERT', 'Stable Diffusion', 'Midjourney',
# 'DALL-E', 'AI绘画', 'AI写作', 'AI编程', '自动驾驶', '智能助手', '语音识别', '图像识别', '目标检测', '语义分割', '数据挖掘', '知识图谱', '推荐系统'
# ]
colloquial_patterns = [
# Interrogative particles: the leading characters were lost in the diff;
# 吗/呢/吧/嘛/么/啥/咋 are assumed reconstructions
r'吗\?*',
r'是不是',
r'呢\?*',
r'吧\?*',
r'嘛\?*',
r'么\?*',
r'啥\?*',
r'咋\?*',
# Exclamatory particles
r'啊!*',
r'呀!*',
r'啦!*',
r'哇!*',
r'哦!*',
r'哟!*',
r'诶!*',
# Colloquial expressions
r'^啊\s+',
r'^呃\s+',
r'^嗯\s+',
r'^哼\s+',
r'^哈\s+',
r'^嘿\s+',
r'牢大',
r'hhh+',
r'哈哈+',
r'嘻嘻+',
r'嘿嘿+',
r'呵呵+',
r'太NB了',
r'牛啊',
r'太厉害了吧',
r'卧槽',
r'我靠',
# (two entries whose characters were lost in the diff are omitted here)
r'啥意思',
r'怎么回事',
# Other colloquial expressions
r'我的妈呀',
r'天啊',
r'上帝',
r'佛祖',
r'妈呀',
r'我晕',
r'我吐了',
r'我裂开了',
r'我傻了',
r'我惊了',
r'救命',
r'要命',
r'完蛋',
r'糟糕',
r'该死'
]
# Deduplicate and sort
keywords = sorted(set(keywords))
colloquial_patterns = sorted(set(colloquial_patterns))

if __name__ == "__main__":
    # Save to a file
    with open('keywords.txt', 'w', encoding='utf-8') as f:
        for keyword in keywords:
            f.write(keyword + '\n')
    print(f"Keyword file created with {len(keywords)} AI-related keywords")

@ -0,0 +1,117 @@
import re
import time

import bs4
import requests


class Scawler:
    def __init__(self):
        self.url_ref = 'https://search.bilibili.com/all?vt=83547368&keyword=LLM'
        self.headers = {
            # Referer: anti-hotlinking header telling the server which page
            # the request came from
            # "Referer": "https://www.bilibili.com/video/BV1454y187Er/",
            "Referer": self.url_ref,
            # User-Agent: basic browser/device identity
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        }
        self.url_interface_cid = "https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        self.url_page_base = "https://search.bilibili.com/all?vt=85151086&keyword=LLM&page={page}&o={offset}"

    # Fetch the cid from a video page
    def get_cid(self, url):
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        text = response.text
        match = re.search(r'<title>(.*?)</title>', text)
        if match:
            title_content = match.group(1)
            # print(f"title_content: {title_content}")
        match = re.search(r'"cid":(\d+)', text)
        if match:
            return match.group(1)
        return None

    # Fetch the danmaku for a cid via the Bilibili list API
    def get_from_cid(self, cid):
        self.url_cid = self.url_interface_cid.format(cid=cid)
        response = requests.get(self.url_cid, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        text = response.text

        def get_parse_list(text):
            danmaku_list = []
            pattern = r'<d p="([^"]*)">([^<]*)</d>'
            matches = re.findall(pattern, text)
            for match in matches:
                params = match[0].split(',')
                danmaku = {
                    'time': float(params[0]),     # appearance time (seconds)
                    'type': int(params[1]),       # danmaku type
                    'size': int(params[2]),       # font size
                    'color': int(params[3]),      # color
                    'timestamp': int(params[4]),  # send timestamp
                    'pool': int(params[5]),       # danmaku pool
                    'uid': params[6],             # user id
                    'id': params[7],              # danmaku id
                    'text': match[1]              # danmaku content
                }
                danmaku_list.append(danmaku)
            return [i['text'] for i in danmaku_list]

        return get_parse_list(text)

    def get_html(self, page):
        page -= 1
        url_base = self.url_page_base.format(page=page, offset=page * 30)
        response = requests.get(url_base, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return response.text

    def parse_html(self, html, num):
        soup = bs4.BeautifulSoup(html, 'html.parser')
        danmaku_list = []
        for i in range(num):
            selector = f"#i_cecream > div > div:nth-child(2) > div.search-content.search-content--gray > div > div > div > div.video.i_wrapper.search-all-list > div > div:nth-child({i+1}) > div > div.bili-video-card__wrap > a"
            element = soup.select_one(selector)
            if element:
                # Extract the href attribute
                link_url = element.get('href')
                # Extract the element's text, stripped of surrounding whitespace
                link_text = element.get_text(strip=True)
                # Search results use protocol-relative URLs (//www.bilibili.com/...)
                if link_url.startswith("//"):
                    link_url = "https:" + link_url
                print("Extracted:")
                print(f"link url: {link_url}")
                print(f"link text: {link_text}")
                cid = self.get_cid(link_url)
                print(f"got cid: {cid}")
                if cid:  # skip pages where no cid could be found
                    danmaku_list.extend(self.get_from_cid(cid))
            else:
                print("No matching element found.")
            time.sleep(0.5)
        print("Danmaku collection finished")
        return danmaku_list

    def work(self, page, num):
        html = self.get_html(page)
        return self.parse_html(html, num)


if __name__ == "__main__":
    # import cProfile as pf
    # from pstats import SortKey as sk
    # import pstats
    # p = pstats.Stats("profiler_stats")
    # p.strip_dirs().sort_stats(sk.TIME).print_stats(30)
    scrawl = Scawler()
    scrawl.work(1, 2)
    # pf.run("scrawl.work(1, 2)", "profiler_stats")
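
For reference, the list.so endpoint returns XML in which each danmaku is a <d> element whose p attribute packs the comma-separated metadata that get_parse_list unpacks. A minimal illustration with invented values:

import re

sample = '<d p="12.5,1,25,16777215,1700000000,0,abcd1234,1234567890123456">AI技术真厉害</d>'
attrs, text = re.findall(r'<d p="([^"]*)">([^<]*)</d>', sample)[0]
print(attrs.split(','))  # ['12.5', '1', '25', ...]: time, type, size, color, timestamp, pool, uid, id
print(text)              # AI技术真厉害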

@ -0,0 +1,61 @@
import pandas as pd
from collections import Counter
from tool.word_filter import DanmakuFilter


def analyze_danmu_statistics(danmu_list, top_n=8, output_file='danmu_statistics.xlsx'):
    """
    Count danmaku occurrences and export the result to Excel.

    Args:
        danmu_list: list of danmaku strings
        top_n: number of top danmaku to report
        output_file: name of the Excel file to write
    """
    try:
        danmaku_filter = DanmakuFilter()
        danmaku_counter = Counter(danmu_list)
        filtered_counter = danmaku_filter.filter_danmaku(danmaku_counter)
        # Take the top_n most common danmaku
        top_danmus = filtered_counter.most_common(top_n)
        # Build the rows for the data frame
        data = []
        for rank, (danmu, count) in enumerate(top_danmus, 1):
            data.append({
                'rank': rank,
                'danmaku': danmu,
                'count': count,
            })
        df = pd.DataFrame(data)
        print(df.to_string(index=False))
        # Export to Excel (requires an engine such as openpyxl)
        df.to_excel(output_file, index=False)
        return filtered_counter
    except Exception as e:
        print(f"Error while computing statistics: {e}")
        return None


# Usage example
if __name__ == "__main__":
    # Sample data; replace with real danmaku
    sample_danmu_data = [
        "AI技术真厉害", "大模型应用广泛", "深度学习", "神经网络", "机器学习", "AI技术真厉害", "自然语言处理", "计算机视觉", "大模型应用广泛", "强化学习", "AI技术真厉害",
        "生成式AI", "深度学习", "大模型应用广泛", "Transformer", "AI技术真厉害"
    ]
    # Basic statistics
    print("=== Basic statistics ===")
    result = analyze_danmu_statistics(sample_danmu_data, top_n=8)
    if result is not None:
        print("\nPreview of the result:")
        print(result)
    # Detailed report
    print("\n=== Generating detailed report ===")

@ -0,0 +1,50 @@
import re
from collections import Counter


class DanmakuFilter:
    """Danmaku filter: drops colloquial chatter, keeps technical discussion"""

    def __init__(self):
        # Particle and colloquial-expression patterns live in tool.keywords
        import tool.keywords as kw
        # Compile the combined regexes
        self.colloquial_regex = re.compile('|'.join(kw.colloquial_patterns))
        self.llm_regex = re.compile('|'.join(kw.keywords), re.IGNORECASE)

    def is_colloquial(self, text: str) -> bool:
        """Return True if the text contains colloquial expressions"""
        # Contains a mood particle or colloquial phrase
        if self.colloquial_regex.search(text):
            return True
        # Starts with a mood particle
        # if re.match(r'^[啊呀呢吧哦哟诶呃嗯哈哼]+\s*', text):
        #     return True
        # Too many exclamation/question marks suggests strong emotion
        # (the second character was garbled in the diff; '？' is assumed)
        if text.count('!') + text.count('？') > 2:
            return True
        # Very long danmaku are treated as chatter as well
        if len(text) >= 14:
            return True
        return False

    def is_llm_related(self, text: str) -> bool:
        """Return True if the text contains an LLM-related keyword"""
        return bool(self.llm_regex.search(text))

    def filter_danmaku(self, danmaku_map: Counter) -> Counter:
        """Filter danmaku, keeping only LLM-related technical discussion"""
        filtered = Counter()
        for content, freq in danmaku_map.items():
            # Skip empty content
            if not content or not content.strip():
                continue
            # Skip colloquial content
            if self.is_colloquial(content):
                continue
            if self.is_llm_related(content):
                filtered.update({content: freq})
        return filtered
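
A small usage sketch (strings and counts invented) of what filter_danmaku keeps and drops, assuming the reconstructed patterns above:

from collections import Counter

danmaku_filter = DanmakuFilter()
counts = Counter({
    "大模型应用广泛": 3,  # kept: matches an LLM keyword, no colloquial marker
    "卧槽太强了": 5,      # dropped: matches the colloquial pattern 卧槽
    "天气不错": 2,        # dropped: no LLM-related keyword
})
print(danmaku_filter.filter_danmaku(counts))  # Counter({'大模型应用广泛': 3})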

@ -0,0 +1,11 @@
{
    "folders": [
        {
            "path": ".."
        },
        {
            "path": "../.."
        }
    ],
    "settings": {}
}