complete basic function

main
zrj 6 months ago
parent af79ad9d01
commit b747054991

Binary file not shown.

Binary file not shown.

@ -0,0 +1,20 @@
from tool.scrawl import Scawler
from tool.static import analyze_danmu_statistics
from tool.cloud_show import Cloud_shower

if __name__ == "__main__":
    # Re-enable this block to re-scrape the danmaku instead of reading the cached file:
    # scawler = Scawler()
    # ls = scawler.work(1, 40)
    # with open('./raw_danmu.txt', 'w', encoding='utf-8') as f:
    #     for danmu in ls:
    #         f.write(danmu + '\n')
    with open('./raw_danmu.txt', 'r', encoding='utf-8') as f:
        ls = [line.strip() for line in f if line.strip()]
    danmu_counter = analyze_danmu_statistics(ls)
    if danmu_counter:  # analyze_danmu_statistics returns None on failure
        shower = Cloud_shower()
        shower.to_show(danmu_counter)
        print("Word cloud generated; saved as ai_danmu_stylecloud.png")

File diff suppressed because it is too large.

@ -0,0 +1,53 @@
from stylecloud import gen_stylecloud


class Cloud_shower:
    def __init__(self):
        pass

    def to_show(self, sample_danmu_data_frq):
        # Generate the word cloud with stylecloud: expand the {word: frequency}
        # mapping into a space-joined blob, repeating each word `freq` times so
        # the cloud reflects the counts.
        text_list = []
        for word, freq in sample_danmu_data_frq.items():
            text_list.extend([word] * freq)
        text = " ".join(text_list)
        gen_stylecloud(
            text=text,                                 # preprocessed text
            size=1024,                                 # image size; larger is sharper
            font_path='msyh.ttc',                      # Chinese font path (e.g. Microsoft YaHei)
            output_name='ai_danmu_stylecloud.png',     # output file name
            icon_name='fas fa-question-circle',
            custom_stopwords=[],                       # custom stopwords (the original entries were lost in the diff)
            palette='colorbrewer.qualitative.Set1_8',  # preset color palette
            # background_color='white',                # background color
            gradient='horizontal',                     # gradient direction
            max_font_size=200,                         # largest font size
            max_words=500,                             # maximum number of words shown
        )


if __name__ == "__main__":
    import tool.csv_parse as cp

    # Usage example
    data_map = cp.read_csv_to_list('./result.csv')
    print(data_map)
    shower = Cloud_shower()
    # Inline sample data, kept for quick experiments (unused below)
    sample_danmu_data = [
        "AI技术真厉害",
        "大模型应用广泛",
        "深度学习",
        "神经网络",
        "机器学习",
        "AI技术真厉害",
        "自然语言处理",
        "计算机视觉",
        "大模型应用广泛",
        "强化学习",
        "AI技术真厉害",
        "生成式AI",
    ]
    shower.to_show(data_map)
    print("Word cloud generated; saved as ai_danmu_stylecloud.png")

@ -0,0 +1,61 @@
import csv
from typing import Dict, List, Tuple


def read_csv_to_list(csv_file: str) -> Dict[str, int]:
    """
    Read a CSV file into a {content: frequency} mapping.

    Args:
        csv_file: path to the CSV file
    Returns:
        Dict[str, int]: mapping of content to frequency
    """
    result = {}
    try:
        with open(csv_file, 'r', encoding='utf-8') as file:
            csv_reader = csv.reader(file)
            # Skip the header row, if present
            headers = next(csv_reader, None)
            for row in csv_reader:
                content = row[0].strip()  # content column
                frequency = int(row[1])   # frequency column, converted to int
                result[content] = frequency
    except FileNotFoundError:
        print(f"Error: file {csv_file} not found")
    except ValueError as e:
        print(f"Error: could not convert frequency value - {e}")
    except Exception as e:
        print(f"Error while reading the file: {e}")
    return result


def trans_list_to_csv(ls: List[Tuple[str, int]], csv_file: str) -> None:
    """
    Write a list of (content, frequency) tuples to a CSV file.

    Args:
        ls: list of (content, frequency) tuples
        csv_file: path of the CSV file to write
    """
    try:
        with open(csv_file, 'w', encoding='utf-8', newline='') as file:
            csv_writer = csv.writer(file)
            # Header row
            csv_writer.writerow(['content', 'frequency'])
            # Data rows; commas inside the content would break naive parsing,
            # so they are replaced with underscores
            for content, frequency in ls:
                csv_writer.writerow([content.replace(",", "_"), frequency])
        print(f"Data saved to {csv_file}")
    except Exception as e:
        print(f"Error while writing the file: {e}")

@ -0,0 +1,309 @@
# Keyword file for AI/LLM-related danmaku filtering
keywords = [
# Basic terminology
'AI',
'人工智能',
'机器学习',
'深度学习',
'神经网络',
'自然语言处理',
'NLP',
'计算机视觉',
'CV',
'大模型',
'GPT',
'ChatGPT',
'文心一言',
'通义千问',
'智谱',
'讯飞星火',
'强化学习',
'生成式AI',
'AIGC',
'LLM',
'扩散模型',
'Transformer',
'BERT',
'Stable Diffusion',
'Midjourney',
'DALL-E',
'AI绘画',
'AI写作',
'AI编程',
'自动驾驶',
'智能助手',
'语音识别',
'图像识别',
'目标检测',
'语义分割',
'数据挖掘',
'知识图谱',
'推荐系统',
# Large-model related
'GPT-3',
'GPT-4',
'GPT-5',
'ChatGLM',
'LLaMA',
'Vicuna',
'Alpaca',
'Bloom',
'T5',
'BART',
'ERNIE',
'Claude',
'Gemini',
'PaLM',
'LaMDA',
'Codex',
'Copilot',
'InstructGPT',
'Sparrow',
'Gopher',
'Chinchilla',
'Jurassic',
'WuDao',
'PanGu',
'PLUG',
'M6',
'CPM',
'EVA',
'CogView',
# Multimodal AI
'多模态',
'视觉语言模型',
'VLM',
'DALL-E2',
'DALL-E3',
'Imagen',
'Parti',
'CogVideo',
'Make-A-Video',
'Phenaki',
'NUWA',
'CogView2',
'CogView3',
'文生图',
'图生文',
'文生视频',
'语音合成',
'TTS',
'ASR',
'语音克隆',
# Technical architecture
'注意力机制',
'自注意力',
'多头注意力',
'编码器',
'解码器',
'预训练',
'微调',
'提示工程',
'Prompt',
'思维链',
'CoT',
'零样本学习',
'小样本学习',
'指令调优',
'RLHF',
'人类反馈强化学习',
'对齐',
'缩放定律',
'涌现能力',
# Application scenarios
'智能客服',
'聊天机器人',
'虚拟人',
'数字人',
'AI主播',
'内容生成',
'代码生成',
'智能编程',
'低代码',
'无代码',
'智能文档',
'RAG',
'检索增强',
'AI搜索',
'智能问答',
'知识库',
'智能诊断',
'AI制药',
'AI金融',
'量化交易',
'风险控制',
'智能投顾',
'AI教育',
'个性化学习',
'智慧城市',
'智能交通',
'工业AI',
'预测性维护',
'质量检测',
'AI农业',
'精准农业',
'智能家居',
# Tools and frameworks
'TensorFlow',
'PyTorch',
'Keras',
'Hugging Face',
'Transformers库',
'Diffusers',
'LangChain',
'LlamaIndex',
'AutoGPT',
'BabyAGI',
'OpenAI',
'Anthropic',
'Google AI',
'Microsoft AI',
'Meta AI',
'百度AI',
'阿里云',
'腾讯云',
'华为云',
'讯飞开放平台',
# Technical concepts
'监督学习',
'无监督学习',
'半监督学习',
'自监督学习',
'迁移学习',
'元学习',
'联邦学习',
'图神经网络',
'GNN',
'胶囊网络',
'神经图灵机',
'生成对抗网络',
'GAN',
'变分自编码器',
'VAE',
'流模型',
'标准化流',
'贝叶斯深度学习',
'可解释AI',
'XAI',
'因果推断',
'鲁棒性',
'公平性',
# Emerging directions
'具身智能',
'AGI',
'通用人工智能',
'超级智能',
'AI安全',
'对齐问题',
'AI伦理',
'AI治理',
'AI法规',
'AI for Science',
'科学智能',
'AlphaFold',
'天气预报',
'气候模拟',
'蛋白质设计',
'材料发现',
'AI辅助创作',
# Chinese vendors and labs
'文心一言',
'通义千问',
'讯飞星火',
'腾讯混元',
'字节豆包',
'智谱AI',
'月之暗面',
'深度求索',
'零一万物',
'百川智能',
'昆仑万维',
'幻方AI',
'上海AI实验室',
'北京智源',
'之江实验室'
]
# Smaller keyword list kept for validation
# ai_keywords = [
# 'AI', '人工智能', '机器学习', '深度学习', '神经网络', '自然语言处理', 'NLP', '计算机视觉', 'CV', '大模型', 'GPT', 'ChatGPT', '文心一言', '通义千问',
# '智谱', '讯飞星火', '强化学习', '生成式AI', 'AIGC', 'LLM', '扩散模型', 'Transformer', 'BERT', 'Stable Diffusion', 'Midjourney',
# 'DALL-E', 'AI绘画', 'AI写作', 'AI编程', '自动驾驶', '智能助手', '语音识别', '图像识别', '目标检测', '语义分割', '数据挖掘', '知识图谱', '推荐系统'
# ]
colloquial_patterns = [
# Interrogative particles: the leading characters were lost in the diff;
# 吗/呢/吧/嘛/么/啥/咋 are assumed reconstructions
r'吗\?*',
r'是不是',
r'呢\?*',
r'吧\?*',
r'嘛\?*',
r'么\?*',
r'啥\?*',
r'咋\?*',
# Exclamatory particles
r'啊!*',
r'呀!*',
r'啦!*',
r'哇!*',
r'哦!*',
r'哟!*',
r'诶!*',
# Colloquial expressions
r'^啊\s+',
r'^呃\s+',
r'^嗯\s+',
r'^哼\s+',
r'^哈\s+',
r'^嘿\s+',
r'牢大',
r'hhh+',
r'哈哈+',
r'嘻嘻+',
r'嘿嘿+',
r'呵呵+',
r'太NB了',
r'牛啊',
r'太厉害了吧',
r'卧槽',
r'我靠',
# (two entries whose characters were lost in the diff are omitted here)
r'啥意思',
r'怎么回事',
# Other colloquial expressions
r'我的妈呀',
r'天啊',
r'上帝',
r'佛祖',
r'妈呀',
r'我晕',
r'我吐了',
r'我裂开了',
r'我傻了',
r'我惊了',
r'救命',
r'要命',
r'完蛋',
r'糟糕',
r'该死'
]
# Deduplicate and sort
keywords = sorted(set(keywords))
colloquial_patterns = sorted(set(colloquial_patterns))

if __name__ == "__main__":
    # Save to a file
    with open('keywords.txt', 'w', encoding='utf-8') as f:
        for keyword in keywords:
            f.write(keyword + '\n')
    print(f"Keyword file created with {len(keywords)} AI-related keywords")

@ -0,0 +1,117 @@
import re
import time

import bs4
import requests


class Scawler:
    def __init__(self):
        self.url_ref = 'https://search.bilibili.com/all?vt=83547368&keyword=LLM'
        self.headers = {
            # Referer: anti-hotlinking header telling the server which page
            # the request came from
            # "Referer": "https://www.bilibili.com/video/BV1454y187Er/",
            "Referer": self.url_ref,
            # User-Agent: basic browser/device identity
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        }
        self.url_interface_cid = "https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        self.url_page_base = "https://search.bilibili.com/all?vt=85151086&keyword=LLM&page={page}&o={offset}"

    # Fetch the cid from a video page
    def get_cid(self, url):
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        text = response.text
        match = re.search(r'<title>(.*?)</title>', text)
        if match:
            title_content = match.group(1)
            # print(f"title_content: {title_content}")
        match = re.search(r'"cid":(\d+)', text)
        if match:
            return match.group(1)
        return None

    # Fetch the danmaku for a cid via the Bilibili list API
    def get_from_cid(self, cid):
        self.url_cid = self.url_interface_cid.format(cid=cid)
        response = requests.get(self.url_cid, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        text = response.text

        def get_parse_list(text):
            danmaku_list = []
            pattern = r'<d p="([^"]*)">([^<]*)</d>'
            matches = re.findall(pattern, text)
            for match in matches:
                params = match[0].split(',')
                danmaku = {
                    'time': float(params[0]),     # appearance time (seconds)
                    'type': int(params[1]),       # danmaku type
                    'size': int(params[2]),       # font size
                    'color': int(params[3]),      # color
                    'timestamp': int(params[4]),  # send timestamp
                    'pool': int(params[5]),       # danmaku pool
                    'uid': params[6],             # user id
                    'id': params[7],              # danmaku id
                    'text': match[1]              # danmaku content
                }
                danmaku_list.append(danmaku)
            return [i['text'] for i in danmaku_list]

        return get_parse_list(text)

    def get_html(self, page):
        page -= 1
        url_base = self.url_page_base.format(page=page, offset=page * 30)
        response = requests.get(url_base, headers=self.headers, timeout=10)
        response.encoding = 'utf-8'
        return response.text

    def parse_html(self, html, num):
        soup = bs4.BeautifulSoup(html, 'html.parser')
        danmaku_list = []
        for i in range(num):
            selector = f"#i_cecream > div > div:nth-child(2) > div.search-content.search-content--gray > div > div > div > div.video.i_wrapper.search-all-list > div > div:nth-child({i+1}) > div > div.bili-video-card__wrap > a"
            element = soup.select_one(selector)
            if element:
                # Extract the href attribute
                link_url = element.get('href')
                # Extract the element's text, stripped of surrounding whitespace
                link_text = element.get_text(strip=True)
                # Search results use protocol-relative URLs (//www.bilibili.com/...)
                if link_url.startswith("//"):
                    link_url = "https:" + link_url
                print("Extracted:")
                print(f"link url: {link_url}")
                print(f"link text: {link_text}")
                cid = self.get_cid(link_url)
                print(f"got cid: {cid}")
                if cid:  # skip pages where no cid could be found
                    danmaku_list.extend(self.get_from_cid(cid))
            else:
                print("No matching element found.")
            time.sleep(0.5)
        print("Danmaku collection finished")
        return danmaku_list

    def work(self, page, num):
        html = self.get_html(page)
        return self.parse_html(html, num)


if __name__ == "__main__":
    # import cProfile as pf
    # from pstats import SortKey as sk
    # import pstats
    # p = pstats.Stats("profiler_stats")
    # p.strip_dirs().sort_stats(sk.TIME).print_stats(30)
    scrawl = Scawler()
    scrawl.work(1, 2)
    # pf.run("scrawl.work(1, 2)", "profiler_stats")
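
For reference, the list.so endpoint returns XML in which each danmaku is a <d> element whose p attribute packs the comma-separated metadata that get_parse_list unpacks. A minimal illustration with invented values:

import re

sample = '<d p="12.5,1,25,16777215,1700000000,0,abcd1234,1234567890123456">AI技术真厉害</d>'
attrs, text = re.findall(r'<d p="([^"]*)">([^<]*)</d>', sample)[0]
print(attrs.split(','))  # ['12.5', '1', '25', ...]: time, type, size, color, timestamp, pool, uid, id
print(text)              # AI技术真厉害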

@ -0,0 +1,61 @@
import pandas as pd
from collections import Counter
from tool.word_filter import DanmakuFilter


def analyze_danmu_statistics(danmu_list, top_n=8, output_file='danmu_statistics.xlsx'):
    """
    Count danmaku occurrences and export the result to Excel.

    Args:
        danmu_list: list of danmaku strings
        top_n: number of top danmaku to report
        output_file: name of the Excel file to write
    """
    try:
        danmaku_filter = DanmakuFilter()
        danmaku_counter = Counter(danmu_list)
        filtered_counter = danmaku_filter.filter_danmaku(danmaku_counter)
        # Take the top_n most common danmaku
        top_danmus = filtered_counter.most_common(top_n)
        # Build the rows for the data frame
        data = []
        for rank, (danmu, count) in enumerate(top_danmus, 1):
            data.append({
                'rank': rank,
                'danmaku': danmu,
                'count': count,
            })
        df = pd.DataFrame(data)
        print(df.to_string(index=False))
        # Export to Excel (requires an engine such as openpyxl)
        df.to_excel(output_file, index=False)
        return filtered_counter
    except Exception as e:
        print(f"Error while computing statistics: {e}")
        return None


# Usage example
if __name__ == "__main__":
    # Sample data; replace with real danmaku
    sample_danmu_data = [
        "AI技术真厉害", "大模型应用广泛", "深度学习", "神经网络", "机器学习", "AI技术真厉害", "自然语言处理", "计算机视觉", "大模型应用广泛", "强化学习", "AI技术真厉害",
        "生成式AI", "深度学习", "大模型应用广泛", "Transformer", "AI技术真厉害"
    ]
    # Basic statistics
    print("=== Basic statistics ===")
    result = analyze_danmu_statistics(sample_danmu_data, top_n=8)
    if result is not None:
        print("\nPreview of the result:")
        print(result)
    # Detailed report
    print("\n=== Generating detailed report ===")

@ -0,0 +1,50 @@
import re
from collections import Counter


class DanmakuFilter:
    """Danmaku filter: drops colloquial chatter, keeps technical discussion"""

    def __init__(self):
        # Particle and colloquial-expression patterns live in tool.keywords
        import tool.keywords as kw
        # Compile the combined regexes
        self.colloquial_regex = re.compile('|'.join(kw.colloquial_patterns))
        self.llm_regex = re.compile('|'.join(kw.keywords), re.IGNORECASE)

    def is_colloquial(self, text: str) -> bool:
        """Return True if the text contains colloquial expressions"""
        # Contains a mood particle or colloquial phrase
        if self.colloquial_regex.search(text):
            return True
        # Starts with a mood particle
        # if re.match(r'^[啊呀呢吧哦哟诶呃嗯哈哼]+\s*', text):
        #     return True
        # Too many exclamation/question marks suggests strong emotion
        # (the second character was garbled in the diff; '？' is assumed)
        if text.count('!') + text.count('？') > 2:
            return True
        # Very long danmaku are treated as chatter as well
        if len(text) >= 14:
            return True
        return False

    def is_llm_related(self, text: str) -> bool:
        """Return True if the text contains an LLM-related keyword"""
        return bool(self.llm_regex.search(text))

    def filter_danmaku(self, danmaku_map: Counter) -> Counter:
        """Filter danmaku, keeping only LLM-related technical discussion"""
        filtered = Counter()
        for content, freq in danmaku_map.items():
            # Skip empty content
            if not content or not content.strip():
                continue
            # Skip colloquial content
            if self.is_colloquial(content):
                continue
            if self.is_llm_related(content):
                filtered.update({content: freq})
        return filtered
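
A small usage sketch (strings and counts invented) of what filter_danmaku keeps and drops, assuming the reconstructed patterns above:

from collections import Counter

danmaku_filter = DanmakuFilter()
counts = Counter({
    "大模型应用广泛": 3,  # kept: matches an LLM keyword, no colloquial marker
    "卧槽太强了": 5,      # dropped: matches the colloquial pattern 卧槽
    "天气不错": 2,        # dropped: no LLM-related keyword
})
print(danmaku_filter.filter_danmaku(counts))  # Counter({'大模型应用广泛': 3})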

@ -0,0 +1,11 @@
{
    "folders": [
        {
            "path": ".."
        },
        {
            "path": "../.."
        }
    ],
    "settings": {}
}