complete analysis of work efficiency with cProfile

main
zrj 3 months ago
parent 93498468d7
commit 1e2aef5a4f

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 174 KiB

@ -4,17 +4,36 @@ import tool.keywords as keywords
import tool.csv_parse as cp
from tool.static import analyze_danmu_statistics
from tool.cloud_show import Cloud_shower
import cProfile
import pstats
from io import StringIO
if __name__ == "__main__":
    # --- Crawler performance-analysis section ---
    # Profile the full scrape -> statistics -> word-cloud pipeline with cProfile.
    scawler = Scawler()
    profiler = cProfile.Profile()
    profiler.enable()
    # NOTE(review): presumably pages 1..4 of search results — confirm against Scawler.work
    ls = scawler.work(1, 4)
    danmu_counter = analyze_danmu_statistics(ls)
    shower = Cloud_shower()
    shower.to_show(danmu_counter)
    profiler.disable()
    # Render the profiling statistics into an in-memory text buffer.
    stream = StringIO()
    stats = pstats.Stats(profiler, stream=stream)
    stats.sort_stats('cumulative')  # sort by cumulative time
    stats.print_stats(20)  # show only the top 20 rows
    print("\n=== 详细分析报告 ===")
    print(stream.getvalue())
    # Data-scraping section (disabled; kept for reference)
    # scawler = Scawler()
    # ls = scawler.work(1, 40)
    # with open('./raw_danmu.txt', 'w', encoding='utf-8') as f:
    # for danmu in ls:
    # f.write(danmu + '\n')
    # Reload previously scraped danmu lines from disk (blank lines dropped)
    # and regenerate the word cloud.
    ls = []
    with open('./raw_danmu.txt', 'r', encoding='utf-8') as f:
        ls = [line.strip() for line in f.readlines() if line.strip()]
    danmu_counter = analyze_danmu_statistics(ls)
    shower = Cloud_shower()
    shower.to_show(danmu_counter)
    print("词云生成完毕,保存为 ai_danmu_stylecloud.png")
    # Data-processing section (disabled duplicate of the block above)
    # ls = []
    # with open('./raw_danmu.txt', 'r', encoding='utf-8') as f:
    # ls = [line.strip() for line in f.readlines() if line.strip()]
    # danmu_counter = analyze_danmu_statistics(ls)
    # shower = Cloud_shower()
    # shower.to_show(danmu_counter)

Binary file not shown.

File diff suppressed because it is too large Load Diff

@ -0,0 +1,45 @@
import cProfile
import pstats
from io import StringIO
def complex_calculation(n):
    """Simulate an expensive CPU-bound computation for the profiling demo.

    Equivalent to summing ``i * j`` over every pair with ``0 <= j < i < n``.
    """
    return sum(i * j for i in range(n) for j in range(i))
def data_processing():
    """Run complex_calculation over sizes 100..199 and return the mean result."""
    total = 0
    count = 0
    for size in range(100, 200):
        total += complex_calculation(size)
        count += 1
    return total / count
def analyze_with_cprofile():
    """Profile data_processing() with cProfile and print a pstats summary.

    The report is sorted by cumulative time, limited to the top 20 entries,
    and printed to stdout. Returns the value produced by data_processing().
    """
    profiler = cProfile.Profile()
    profiler.enable()
    outcome = data_processing()  # the code under measurement
    profiler.disable()
    # Render the statistics into an in-memory buffer, then dump it to stdout.
    report_buffer = StringIO()
    pstats.Stats(profiler, stream=report_buffer).sort_stats('cumulative').print_stats(20)
    print("\n=== 详细分析报告 ===")
    print(report_buffer.getvalue())
    return outcome


# Run the profiling demo at import/execution time.
analyze_with_cprofile()

@ -35,19 +35,5 @@ if __name__ == "__main__":
print(data_map)
shower = Cloud_shower()
sample_danmu_data = [
"AI技术真厉害",
"大模型应用广泛",
"深度学习",
"神经网络",
"机器学习",
"AI技术真厉害",
"自然语言处理",
"计算机视觉",
"大模型应用广泛",
"强化学习",
"AI技术真厉害",
"生成式AI",
]
shower.to_show(data_map)
print("词云生成完毕,保存为 ai_danmu_stylecloud.png")

@ -65,6 +65,7 @@ class Scawler:
danmaku_list = get_parse_list(text)
return danmaku_list
# 获取搜索结果页面的HTML内容
def get_html(self, page):
page -= 1
url_base = self.url_page_base.format(page=page, offset=page * 30)
@ -72,6 +73,7 @@ class Scawler:
response.encoding = 'utf-8'
return response.text
# 解析HTML内容提取视频链接并获取弹幕
def parse_html(self, html, num):
soup = bs4.BeautifulSoup(html, 'html.parser')
danmaku_list = []

@ -42,9 +42,7 @@ def analyze_danmu_statistics(danmu_list, top_n=8, output_file='danmu_statistics.
return None
# 使用示例
if __name__ == "__main__":
# 示例数据 - 这里替换为您实际的弹幕数据
sample_danmu_data = [
"AI技术真厉害", "大模型应用广泛", "深度学习", "神经网络", "机器学习", "AI技术真厉害", "自然语言处理", "计算机视觉", "大模型应用广泛", "强化学习", "AI技术真厉害",
"生成式AI", "深度学习", "大模型应用广泛", "Transformer", "AI技术真厉害"

@ -173,10 +173,6 @@ if __name__ == "__main__":
```
**改进思路:**
使用异步爬虫
**性能分析图:**
<div class ='image-gallery'>
@ -186,9 +182,28 @@ if __name__ == "__main__":
**消耗最大函数分析:**
<div class ='image-gallery'>
![alt text](image-10.png)
**不难发现性能主要消耗在request**
</div>
**改进思路:**
* 预计使用异步爬虫或多线程加上ip代理进行request的爬取速度优化和避免被ban
### 2.4 数据结论
**数据样例展示**
<div class ='image-gallery'>
![alt text](image.png)
</div>
**主要发现:**
1. LLM视频内学习打卡弹幕数量巨大
2. 少有真正讨论LLM的弹幕,内容更偏向于口语化评价
@ -281,20 +296,7 @@ if __name__ == "__main__":
---
## 附录
### 核心代码片段
```python
# 主要函数实现
```
### 数据样例展示
<div class ='image-gallery'>
![alt text](image.png)
</div>
### 参考资源

Loading…
Cancel
Save