complete analysis of work efficiency with cProfile

main
zrj 3 months ago
parent 93498468d7
commit 1e2aef5a4f

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 174 KiB

@ -4,17 +4,36 @@ import tool.keywords as keywords
import tool.csv_parse as cp
from tool.static import analyze_danmu_statistics
from tool.cloud_show import Cloud_shower
import cProfile
import pstats
from io import StringIO
if __name__ == "__main__":
    # --- Crawler performance-analysis section ---
    # Profile the full scrape -> statistics -> word-cloud pipeline with cProfile.
    scawler = Scawler()
    profiler = cProfile.Profile()
    profiler.enable()
    # NOTE(review): presumably pages 1..4 of search results — confirm against Scawler.work
    ls = scawler.work(1, 4)
    danmu_counter = analyze_danmu_statistics(ls)
    shower = Cloud_shower()
    shower.to_show(danmu_counter)
    profiler.disable()
    # Render the profiling statistics into an in-memory text buffer.
    stream = StringIO()
    stats = pstats.Stats(profiler, stream=stream)
    stats.sort_stats('cumulative')  # sort by cumulative time
    stats.print_stats(20)  # show only the top 20 rows
    print("\n=== 详细分析报告 ===")
    print(stream.getvalue())
    # Data-scraping section (disabled; kept for reference)
    # scawler = Scawler()
    # ls = scawler.work(1, 40)
    # with open('./raw_danmu.txt', 'w', encoding='utf-8') as f:
    # for danmu in ls:
    # f.write(danmu + '\n')
    # Reload previously scraped danmu lines from disk (blank lines dropped)
    # and regenerate the word cloud.
    ls = []
    with open('./raw_danmu.txt', 'r', encoding='utf-8') as f:
        ls = [line.strip() for line in f.readlines() if line.strip()]
    danmu_counter = analyze_danmu_statistics(ls)
    shower = Cloud_shower()
    shower.to_show(danmu_counter)
    print("词云生成完毕,保存为 ai_danmu_stylecloud.png")
    # Data-processing section (disabled duplicate of the block above)
    # ls = []
    # with open('./raw_danmu.txt', 'r', encoding='utf-8') as f:
    # ls = [line.strip() for line in f.readlines() if line.strip()]
    # danmu_counter = analyze_danmu_statistics(ls)
    # shower = Cloud_shower()
    # shower.to_show(danmu_counter)

Binary file not shown.

File diff suppressed because it is too large Load Diff

@ -0,0 +1,45 @@
import cProfile
import pstats
from io import StringIO
def complex_calculation(n):
    """Simulate an expensive CPU-bound computation for the profiling demo.

    Equivalent to summing ``i * j`` over every pair with ``0 <= j < i < n``.
    """
    return sum(i * j for i in range(n) for j in range(i))
def data_processing():
    """Run complex_calculation over sizes 100..199 and return the mean result."""
    total = 0
    count = 0
    for size in range(100, 200):
        total += complex_calculation(size)
        count += 1
    return total / count
def analyze_with_cprofile():
    """Profile data_processing() with cProfile and print a pstats summary.

    The report is sorted by cumulative time, limited to the top 20 entries,
    and printed to stdout. Returns the value produced by data_processing().
    """
    profiler = cProfile.Profile()
    profiler.enable()
    outcome = data_processing()  # the code under measurement
    profiler.disable()
    # Render the statistics into an in-memory buffer, then dump it to stdout.
    report_buffer = StringIO()
    pstats.Stats(profiler, stream=report_buffer).sort_stats('cumulative').print_stats(20)
    print("\n=== 详细分析报告 ===")
    print(report_buffer.getvalue())
    return outcome


# Run the profiling demo at import/execution time.
analyze_with_cprofile()

@ -35,19 +35,5 @@ if __name__ == "__main__":
print(data_map)
shower = Cloud_shower()
sample_danmu_data = [
"AI技术真厉害",
"大模型应用广泛",
"深度学习",
"神经网络",
"机器学习",
"AI技术真厉害",
"自然语言处理",
"计算机视觉",
"大模型应用广泛",
"强化学习",
"AI技术真厉害",
"生成式AI",
]
shower.to_show(data_map)
print("词云生成完毕,保存为 ai_danmu_stylecloud.png")

@ -65,6 +65,7 @@ class Scawler:
danmaku_list = get_parse_list(text)
return danmaku_list
# 获取搜索结果页面的HTML内容
def get_html(self, page):
page -= 1
url_base = self.url_page_base.format(page=page, offset=page * 30)
@ -72,6 +73,7 @@ class Scawler:
response.encoding = 'utf-8'
return response.text
# 解析HTML内容提取视频链接并获取弹幕
def parse_html(self, html, num):
soup = bs4.BeautifulSoup(html, 'html.parser')
danmaku_list = []

@ -42,9 +42,7 @@ def analyze_danmu_statistics(danmu_list, top_n=8, output_file='danmu_statistics.
return None
# 使用示例
if __name__ == "__main__":
# 示例数据 - 这里替换为您实际的弹幕数据
sample_danmu_data = [
"AI技术真厉害", "大模型应用广泛", "深度学习", "神经网络", "机器学习", "AI技术真厉害", "自然语言处理", "计算机视觉", "大模型应用广泛", "强化学习", "AI技术真厉害",
"生成式AI", "深度学习", "大模型应用广泛", "Transformer", "AI技术真厉害"

@ -173,10 +173,6 @@ if __name__ == "__main__":
```
**改进思路:**
使用异步爬虫
**性能分析图:**
<div class ='image-gallery'>
@ -186,9 +182,28 @@ if __name__ == "__main__":
**消耗最大函数分析:**
<div class ='image-gallery'>
![alt text](image-10.png)
**不难发现性能主要消耗在request**
</div>
**改进思路:**
* 预计使用异步爬虫或多线程加上ip代理进行request的爬取速度优化和避免被ban
### 2.4 数据结论
**数据样例展示**
<div class ='image-gallery'>
![alt text](image.png)
</div>
**主要发现:**
1. LLM视频内学习打卡弹幕数量巨大
2. 少有真正讨论LLM的弹幕,内容更偏向于口语化评价
@ -281,20 +296,7 @@ if __name__ == "__main__":
---
## 附录
### 核心代码片段
```python
# 主要函数实现
```
### 数据样例展示
<div class ='image-gallery'>
![alt text](image.png)
</div>
### 参考资源

Loading…
Cancel
Save