feat: code folder -> version of the project after running a Code Quality Analysis tool and eliminating all warnings

developed_code folder -> version improved after using cProfile to find the performance bottlenecks in the code

output folder -> the generated output
main
poppoppuppylove 2 months ago
parent 5baca231b8
commit 64d179c027

.idea/.gitignore vendored

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,5 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
  </profile>
</component>

.idea/vcs.xml
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>

code/a_wordcloud.py
@@ -0,0 +1,49 @@
"""
Generate a word cloud from the danmu (bullet-comment) data.
"""
import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
import jieba


def blue_color_func(_random_state=None, **_kwargs):
    """
    Generates a color in the HSL format with a random lightness value.

    Parameters:
        _random_state (None or int): Used to seed the random number generator.
        **_kwargs: Additional arguments (ignored in this function).

    Returns:
        str: A string representing the color in HSL format.
    """
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


def wordcloud_generation(danmu_data):
    """Generate the word cloud image and save it."""
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dm_string = ' '.join(dm_list)
    dmreal_string = ' '.join(jieba.lcut(dm_string))
    img = imread("E:/Crawler/output/OIP.jpg")
    my_stopwords = {'', '', '', '', '', '', '', '', '', '', '', '', '', '', ''}
    wc = wordcloud.WordCloud(
        stopwords=my_stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file('E:/Crawler/output/danmu_dwordcloud.png')


def main():
    """Load the data and generate the word cloud."""
    dm = pd.read_excel('E:/Crawler/output/Top8_Danmu.xlsx', sheet_name='Sheet1')
    wordcloud_generation(dm)


if __name__ == '__main__':
    main()

code/b_wordcloud.py
@@ -0,0 +1,48 @@
"""
Generate a word cloud from the full danmu data set, with keyword extraction
and normalization.
"""
import re
import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
from jieba import analyse


def blue_color_func(*_args, **_kwargs):
    """Blue palette for the word cloud colors; ignores all WordCloud callback arguments."""
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


def normalize_hahaha(text):
    """Normalize every run of three or more '哈' to the canonical '哈哈哈'."""
    return re.sub(r'哈{3,}', '哈哈哈', text)


def wordcloud_generation(danmu_data):
    """Generate the word cloud image and save it."""
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dm_list = [normalize_hahaha(text) for text in dm_list]
    dm_string = ' '.join(dm_list)
    keywords = analyse.extract_tags(dm_string, topK=100, withWeight=False, allowPOS=())
    keywords = [word for word in keywords if word not in my_stopwords]
    dmreal_string = ' '.join(keywords)
    img = imread("E:/Crawler/output/OIP.jpg")
    wc = wordcloud.WordCloud(
        stopwords=my_stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file('E:/Crawler/output/alldanmu_dwordcloud.png')


# Load the data and generate the word cloud
dm = pd.read_excel('E:/Crawler/output/All_Danmu.xlsx', sheet_name='Sheet1')
my_stopwords = {'', '', '', '', '', '', '', '', '', '', '', '', '不是', '', '哈哈哈',
                '', '', '', '', '', '', '', '呵呵', '', '嘿嘿', '哎呀', '', '', ''}
wordcloud_generation(dm)

code/bvid.py
@@ -0,0 +1,47 @@
"""
Extract video BV ids from Bilibili search results and save them to a file.
"""
import re
import requests
from common_headers import HEADERS  # shared request-headers module


def get_source(page_num):
    """Fetch the HTML source of one Bilibili search-result page."""
    get_url = (
        f'https://api.bilibili.com/x/web-interface'
        f'/wbi/search/type?__refresh__=true&_extra=&'
        f'context=&page={page_num}'
        '&page_size=42&from_source=&from_spmid=333.337&'
        'platform=pc&highlight=1&single_column=0&'
        'keyword=2024巴黎奥运会'
        '&qv_id=zaOudcC1LJI0GehR81nuNQEKktKQ2aP1&ad_resource=5654'
        '&source_tag=3&gaia_vtoken=&category_id=&search_type=video'
    )
    response = requests.get(url=get_url, headers=HEADERS, timeout=10)
    return response.text


def extract_bv(source_html):
    """Extract the BV ids from the search-result HTML source."""
    return re.findall('"bvid":"(.*?)","title":".*?', source_html)


def save_bv_to_file(bv_list):
    """Append the BV ids to the output file."""
    with open('E:/Crawler/output/bv_numbers.txt', 'a', encoding='utf-8') as f:
        for bv in bv_list:
            f.write(bv + '\n')


def main():
    """Loop over the result pages, collecting and saving BV ids."""
    counter = 0
    for page in range(1, 9):
        html_source = get_source(page)
        bvs = extract_bv(html_source)
        save_bv_to_file(bvs)
        counter += len(bvs)
        if counter >= 300:
            break
    print("BV id collection finished")


if __name__ == '__main__':
    main()

code/common_headers.py
@@ -0,0 +1,22 @@
"""
Shared HTTP request headers.
"""
HEADERS = {
    'authority': 'api.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cookie': '...',
    'origin': 'https://www.bilibili.com',
    'referer': 'https://space.bilibili.com/1760559884?spm_id_from=333.788.0.0',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}

Binary file not shown.

File diff suppressed because it is too large

@@ -0,0 +1,29 @@
"""Profile the whole pipeline with cProfile and write a report."""
import cProfile
import pstats

import a_wordcloud
import bvid
import to_allexcel
import to_danmu
import to_excel


def run_all():
    a_wordcloud.main()
    bvid.main()
    to_allexcel.main()
    to_danmu.main()
    to_excel.main()


if __name__ == '__main__':
    profiler = cProfile.Profile()
    profiler.enable()
    run_all()
    profiler.disable()
    profiler.dump_stats('performance_profile.prof')
    # Analyze the results and write them to a text report
    with open('performance_report.txt', 'w', encoding='utf-8') as f:
        ps = pstats.Stats(profiler, stream=f)
        ps.sort_stats('cumulative')
        ps.print_stats()

code/to_allexcel.py
@@ -0,0 +1,38 @@
"""
Read the danmu data, count frequencies, and save the statistics to an Excel file.
"""
import pandas as pd


def load_danmu(file_path):
    """Read the danmu lines from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()


def count_danmu(danmu_list):
    """Count how many times each danmu occurs."""
    all_danmus = {}
    for danmu in danmu_list:
        danmu = danmu.strip()
        all_danmus[danmu] = all_danmus.get(danmu, 0) + 1
    return all_danmus


def save_to_excel(all_danmus, excel_file):
    """Save the danmu frequency statistics to an Excel file."""
    sorted_danmus = sorted(all_danmus.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(sorted_danmus, columns=['danmu', 'count'])
    df.to_excel(excel_file, index=False)


def main():
    """Read the danmu data, count frequencies, and save to Excel."""
    danmu_file_path = 'E:/Crawler/output/danmu.txt'
    excel_file = 'E:/Crawler/output/All_Danmu.xlsx'
    danmu_list = load_danmu(danmu_file_path)
    all_danmus = count_danmu(danmu_list)
    save_to_excel(all_danmus, excel_file)
    print("All danmu statistics finished and saved to the Excel sheet")


if __name__ == '__main__':
    main()

code/to_danmu.py
@@ -0,0 +1,48 @@
"""
Fetch the danmu of Bilibili videos and save them to a file.
"""
import re
import json
import requests
from common_headers import HEADERS  # shared request-headers module


def load_bv_numbers(file_path):
    """Read the BV ids from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]


def fetch_video_cids(bv_list):
    """Look up the CID of each video."""
    cid_list = []
    for bv in bv_list:
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp'
        response = requests.get(url=url, headers=HEADERS, timeout=10)
        cid = json.loads(response.text)['data'][0]['cid']
        cid_list.append(cid)
    return cid_list


def fetch_and_save_danmu(cid_list, danmu_file):
    """Crawl the danmu of each video and append them to a file."""
    for cid in cid_list:
        url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        response = requests.get(url=url, headers=HEADERS, timeout=10)
        response.encoding = response.apparent_encoding
        data_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
        with open(danmu_file, mode='a', encoding='utf-8') as f:
            for danmu in data_list:
                f.write(danmu + '\n')


def main():
    """Resolve CIDs from the BV ids, then crawl the danmu."""
    bv_file_path = 'E:/Crawler/output/bv_numbers.txt'
    danmu_output_file = 'E:/Crawler/output/danmu.txt'
    bv_numbers = load_bv_numbers(bv_file_path)
    cids = fetch_video_cids(bv_numbers)
    fetch_and_save_danmu(cids, danmu_output_file)
    print("Danmu crawling finished")


if __name__ == '__main__':
    main()

code/to_excel.py
@@ -0,0 +1,41 @@
"""
Count the AI-related danmu and save the top 8 results to an Excel file.
"""
import pandas as pd


def load_danmu(file_path):
    """Read the danmu lines from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()


def filter_and_count_danmu(danmu_list):
    """Count the frequency of AI-related danmu."""
    all_danmus = {}
    ai_keywords = ['ai', '智能', '技术', '应用', '人机', 'AI', '人工智能', '机器学习', '深度学习', '神经网络']
    for danmu in danmu_list:
        if any(keyword in danmu for keyword in ai_keywords):
            danmu = danmu.strip()
            all_danmus[danmu] = all_danmus.get(danmu, 0) + 1
    return all_danmus


def save_to_excel(all_danmus, excel_file):
    """Save the AI-related danmu statistics to an Excel file."""
    sorted_danmus = sorted(all_danmus.items(), key=lambda x: x[1], reverse=True)[:8]
    df = pd.DataFrame(sorted_danmus, columns=['danmu', 'count'])
    df.to_excel(excel_file, index=False)


def main():
    """Read the danmu data, count the AI-related ones, and save to Excel."""
    danmu_file_path = 'E:/Crawler/output/danmu.txt'
    excel_file = 'E:/Crawler/output/Top8_Danmu.xlsx'
    danmu_list = load_danmu(danmu_file_path)
    all_danmus = filter_and_count_danmu(danmu_list)
    save_to_excel(all_danmus, excel_file)
    print("AI-related danmu statistics finished and saved to the Excel sheet")


if __name__ == '__main__':
    main()

@@ -0,0 +1,245 @@
## Running a Code Quality Analysis tool and eliminating all warnings
After some research, I chose pylint for the analysis.
#### **Pylint**
`Pylint` is a very popular static analysis tool for Python: it detects errors, style problems, and complexity issues in your code, gives the code a score, and lists every warning and error so you can fix them one by one to raise the quality.
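Pylint can also be driven from Python itself. The sketch below is a minimal example, assuming pylint >= 2.12, where `Run` accepts `exit=False` and exposes the 0-10 score as `linter.stats.global_note`; the file list and the 100-character line limit mirror the shell runs below:
```python
# Minimal sketch: run pylint programmatically over the crawler modules.
from pylint.lint import Run

results = Run(
    ['code/a_wordcloud.py', 'code/bvid.py', 'code/to_excel.py',
     '--max-line-length=100'],
    exit=False,  # keep the interpreter alive so we can read the stats
)
print(results.linter.stats.global_note)  # the score pylint reports out of 10
```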
Here are some records from the improvement process:
```text
(.venv) PS E:\Crawler> pylint a_wordcloud.py
************* Module a_wordcloud
a_wordcloud.py:38:0: C0304: Final newline missing (missing-final-newline)
a_wordcloud.py:1:0: C0114: Missing module docstring (missing-module-docstring)
a_wordcloud.py:8:0: C0116: Missing function or method docstring (missing-function-docstring)
a_wordcloud.py:9:11: C0209: Formatting a regular string which could be an f-string (consider-using-f-string)
a_wordcloud.py:8:20: W0613: Unused argument 'word' (unused-argument)
a_wordcloud.py:8:26: W0613: Unused argument 'font_size' (unused-argument)
a_wordcloud.py:8:37: W0613: Unused argument 'position' (unused-argument)
a_wordcloud.py:8:47: W0613: Unused argument 'orientation' (unused-argument)
a_wordcloud.py:8:60: W0613: Unused argument 'random_state' (unused-argument)
a_wordcloud.py:8:0: W0613: Unused argument 'kwargs' (unused-argument)
a_wordcloud.py:18:0: C0116: Missing function or method docstring (missing-function-docstring)
a_wordcloud.py:18:25: W0621: Redefining name 'dm' from outer scope (line 13) (redefined-outer-name)
-----------------------------------
Your code has been rated at 2.94/10
```
```text
(.venv) PS E:\Crawler> pylint bvid.py
************* Module bvid
bvid.py:8:0: C0301: Line too long (1272/100) (line-too-long)
bvid.py:18:0: C0301: Line too long (145/100) (line-too-long)
bvid.py:27:0: C0301: Line too long (335/100) (line-too-long)
bvid.py:65:0: C0304: Final newline missing (missing-final-newline)
bvid.py:1:0: C0114: Missing module docstring (missing-module-docstring)
bvid.py:22:0: C0103: Constant name "cid_num" doesn't conform to UPPER_CASE naming style (invalid-name)
bvid.py:25:0: C0116: Missing function or method docstring (missing-function-docstring)
bvid.py:25:0: C0103: Function name "Get_Source" doesn't conform to snake_case naming style (invalid-name)
bvid.py:25:15: W0621: Redefining name 'page' from outer scope (line 53) (redefined-outer-name)
bvid.py:31:4: W0621: Redefining name 'source' from outer scope (line 55) (redefined-outer-name)
bvid.py:29:15: W3101: Missing timeout argument for method 'requests.get' can cause your program to hang indefinitely (missing-timeout)
bvid.py:36:0: C0116: Missing function or method docstring (missing-function-docstring)
bvid.py:36:0: C0103: Function name "Get_Bv" doesn't conform to snake_case naming style (invalid-name)
bvid.py:36:11: W0621: Redefining name 'source' from outer scope (line 55) (redefined-outer-name)
bvid.py:37:4: W0621: Redefining name 'url_list' from outer scope (line 57) (redefined-outer-name)
bvid.py:37:15: R1734: Consider using [] instead of list() (use-list-literal)
bvid.py:45:0: C0116: Missing function or method docstring (missing-function-docstring)
bvid.py:45:0: C0103: Function name "Save_Bv" doesn't conform to snake_case naming style (invalid-name)
bvid.py:45:12: W0621: Redefining name 'url_list' from outer scope (line 57) (redefined-outer-name)
bvid.py:52:4: C0103: Constant name "bv_count" doesn't conform to UPPER_CASE naming style (invalid-name)
-----------------------------------
Your code has been rated at 3.55/10
```
```text
(.venv) PS E:\Crawler> pylint to_excel.py
************* Module to_excel
to_excel.py:1:0: C0114: Missing module docstring (missing-module-docstring)
to_excel.py:4:0: C0116: Missing function or method docstring (missing-function-docstring)
to_excel.py:10:0: C0116: Missing function or method docstring (missing-function-docstring)
to_excel.py:24:0: C0116: Missing function or method docstring (missing-function-docstring)
to_excel.py:30:0: C0116: Missing function or method docstring (missing-function-docstring)
-----------------------------------
Your code has been rated at 8.21/10
```
I revised the code many times in between; the score finally crept up from 8.21 to 9.23:
```text
(.venv) PS E:\Crawler> pylint E:\Crawler\code\a_wordcloud.py E:\Crawler\code\b_wordcloud.py E:\Crawler\code\bvid.py E:\Crawler\code\to_allexcel.py E:\Crawler\code\to_danmu.py E:\Crawler\code\to_excel.py
************* Module bvid
code\bvid.py:27:15: W0621: Redefining name 'page_number' from outer scope (line 52) (redefined-outer-name)
code\bvid.py:40:11: W0621: Redefining name 'source_code' from outer scope (line 53) (redefined-outer-name)
code\bvid.py:44:12: W0621: Redefining name 'bv_list' from outer scope (line 54) (redefined-outer-name)
code\bvid.py:51:4: C0103: Constant name "bv_count" doesn't conform to UPPER_CASE naming style (invalid-name)
************* Module to_danmu
code\to_danmu.py:33:19: W0621: Redefining name 'bv_numbers' from outer scope (line 58) (redefined-outer-name)
code\to_danmu.py:35:4: W0621: Redefining name 'video_cids' from outer scope (line 59) (redefined-outer-name)
code\to_danmu.py:43:22: W0621: Redefining name 'output_file' from outer scope (line 56) (redefined-outer-name)
code\to_danmu.py:55:4: C0103: Constant name "bv_file_path" doesn't conform to UPPER_CASE naming style (invalid-name)
code\to_danmu.py:56:4: C0103: Constant name "output_file" doesn't conform to UPPER_CASE naming style (invalid-name)
************* Module to_excel
code\to_excel.py:1:0: R0801: Similar lines in 2 files
==bvid:[7:28]
==to_danmu:[8:29]
HEADERS = {
    'authority': 'api.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cookie': '...',
    'origin': 'https://www.bilibili.com',
    'referer': 'https://space.bilibili.com/1760559884?spm_id_from=333.788.0.0',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}
------------------------------------------------------------------
Your code has been rated at 9.23/10
```
```text
(.venv) PS E:\Crawler> pylint E:\Crawler\code\a_wordcloud.py E:\Crawler\code\bvid.py E:\Crawler\code\to_allexcel.py E:\Crawler\code\to_danmu.py E:\Crawler\code\to_excel.py
************* Module to_danmu
code\to_danmu.py:38:4: C0103: Variable name "BV_FILE_PATH" doesn't conform to snake_case naming style (invalid-name)
code\to_danmu.py:39:4: C0103: Variable name "DANMU_OUTPUT_FILE" doesn't conform to snake_case naming style (invalid-name)
------------------------------------------------------------------
Your code has been rated at 9.84/10 (previous run: 9.59/10, +0.24)
```
This was the most exciting moment; victory felt within reach.
```text
-------------------------------------------------------------------
Your code has been rated at 10.00/10 (previous run: 9.84/10, +0.16)
```
Two more rounds of edits and it was done!
---
Below is a summary of the improvement process.
# 8.21 -> 10
## 1. Key problems and solutions
### **Problem 1: Naming style does not conform to PEP 8**
**Warning:**
```text
code\to_danmu.py:38:4: C0103: Variable name "BV_FILE_PATH" doesn't conform to snake_case naming style (invalid-name)
```
**Fix:**
Because these names are local variables inside `main()` rather than module-level constants, pylint expects `snake_case`; renaming `BV_FILE_PATH` to `bv_file_path` clears the warning. A short sketch follows below.
**Reflection:**
PEP 8 recommends `snake_case` for variable names and `UPPER_CASE` for constants. Following these conventions makes the code more readable and maintainable, and makes it easier for teammates to understand and take over.
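A minimal sketch of the rename (paths shortened and bodies abridged from the real scripts; the `main_before`/`main_after` names are just for this illustration):
```python
OUTPUT_DIR = 'E:/Crawler/output'  # module-level constant: UPPER_CASE is correct here


def main_before():
    # C0103: a local variable named as if it were a constant
    BV_FILE_PATH = OUTPUT_DIR + '/bv_numbers.txt'
    print(BV_FILE_PATH)


def main_after():
    # local variables use snake_case, so pylint is satisfied
    bv_file_path = OUTPUT_DIR + '/bv_numbers.txt'
    print(bv_file_path)
```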
---
### **Problem 2: Unused arguments**
**Warning:**
```text
code\a_wordcloud.py:10:20: W0613: Unused argument 'word' (unused-argument)
```
**Fix:**
Delete the unused parameters, or rename them with a leading `_` to signal that they are intentionally unused; see the sketch below.
**Reflection:**
Keeping unused parameters makes the code look redundant and confusing; cleaning them up reduces misunderstandings and improves conciseness and maintainability.
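Roughly how the color callback was slimmed down (the `_fixed` suffix exists only so both versions can live in one sketch; the real code keeps the original name):
```python
import numpy as np


# Before: W0613 fires for every parameter the WordCloud callback
# receives but never reads
def blue_color_func(word, font_size, position, orientation,
                    random_state=None, **kwargs):
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


# After: collapse everything into underscore-prefixed catch-alls
def blue_color_func_fixed(*_args, **_kwargs):
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"
```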
---
### **Problem 3: Variable redefinition**
**Warning:**
```text
code\bvid.py:9:15: W0621: Redefining name 'page_number' from outer scope (line 34) (redefined-outer-name)
```
**Fix:**
Rename the variables so the same name is not reused across scopes, e.g. change the parameter `page_number` to `page_num`; a sketch follows below.
**Reflection:**
Reusing the same name in different scopes can muddle the logic, especially in complex functions or loops. Giving each scope its own distinct name avoids latent logic errors and confusion.
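A stripped-down sketch of the shadowing pattern (the `build_url` helper is hypothetical, standing in for `get_source`):
```python
# Before: the parameter shadows the module-level loop variable below,
# which is exactly what W0621 complains about
def build_url(page_number):
    return f'https://api.bilibili.com/x/web-interface/wbi/search/type?page={page_number}'


for page_number in range(1, 3):
    print(build_url(page_number))


# After: the parameter gets its own name, so no scope shadows another
def build_url_fixed(page_num):
    return f'https://api.bilibili.com/x/web-interface/wbi/search/type?page={page_num}'
```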
---
### **Problem 4: Duplicate code**
**Warning:**
```text
code\to_excel.py:1:0: R0801: Similar lines in 2 files
```
**Fix:**
Extract the duplicated fragment into a standalone function or module and call it where needed. Here, the HTTP request headers that had been copied into every crawler were moved into a separate module and imported everywhere, as sketched below.
**Reflection:**
Duplicate code not only raises maintenance cost but can also let the copies drift out of sync. Extracting the common part makes the code more modular, clearer, and easier to maintain.
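The shape of the fix, abridged from this commit's `common_headers.py`:
```python
# common_headers.py -- the shared constant lives in exactly one place
HEADERS = {
    'authority': 'api.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    # ... the remaining header fields ...
}

# bvid.py, to_danmu.py, ... -- every crawler imports it instead of
# redefining its own copy, which is what silenced R0801:
# from common_headers import HEADERS
```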
---
### **Problem 5: Line length over the limit**
**Warning:**
```text
code\a_wordcloud.py:26:0: C0301: Line too long (135/100) (line-too-long)
```
**Fix:**
Split overlong lines, or use string formatting such as f-strings, to keep every line within 100 characters; see the sketch below.
**Reflection:**
Keeping lines short improves readability, especially on narrow displays or when code is printed. Concise, clear code takes less time to read and understand.
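How the long search URL in `bvid.py` was broken up, using Python's implicit concatenation of adjacent string literals (query string abridged):
```python
page_num = 1  # example value

# One 1200+ character literal becomes several short adjacent literals
# that the parser joins back into a single string
get_url = (
    'https://api.bilibili.com/x/web-interface/wbi/search/type'
    '?__refresh__=true&page_size=42&search_type=video'
    f'&page={page_num}&keyword=2024巴黎奥运会'
)
print(get_url)
```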
---
### **Problem 6: Import order and unused imports**
**Warning:**
```text
code\b_wordcloud.py:8:0: C0411: standard import "re" should be placed before third party imports "pandas", "numpy", "jieba.analyse" (wrong-import-order)
```
**Fix:**
Reorder the imports as PEP 8 recommends, standard library first and third-party packages after, and delete any unused imports; see the sketch below.
**Reflection:**
Organizing imports by convention keeps the file tidy and readable, and removing unused imports avoids needless side effects. Keeping the code lean and structured helps reduce latent errors.
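The ordering that clears C0411 in `b_wordcloud.py` (third-party list abridged):
```python
# Standard-library imports first ...
import re

# ... then third-party packages, separated by a blank line
import pandas as pd
import numpy as np
from jieba import analyse
```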
## Summary
- **Naming conventions**: Following the PEP 8 naming rules markedly improves readability and consistency, which matters most when several people collaborate.
- **Parameters and variables**: Cleaning up unused parameters and avoiding redefinition prevents needless confusion and errors.
- **Code duplication**: Extracting common parts reduces duplication and makes the code more modular and easier to maintain.
- **Line length**: Keeping lines short improves readability, especially on constrained displays or in print.
- **Import order**: Organizing imports by convention keeps the code tidy and reduces potential conflicts and errors.

developed_code/a_wordcloud.py
@@ -0,0 +1,48 @@
"""
Generate a word cloud from the danmu (bullet-comment) data.
"""
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
import jieba


def blue_color_func(_random_state=None, **_kwargs):
    """Generates a color in the HSL format with a random lightness value."""
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


def process_text(danmu_list):
    """Tokenize the danmu texts in parallel."""
    with ThreadPoolExecutor() as executor:
        dm_string = ' '.join(
            executor.map(lambda text: ' '.join(jieba.lcut(text)), danmu_list))
    return dm_string


def wordcloud_generation(danmu_data, stopwords, output_path):
    """Generate the word cloud image and save it."""
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dmreal_string = process_text(dm_list)
    img = imread("/output/OIP.jpg")
    wc = wordcloud.WordCloud(
        stopwords=stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file(output_path)


def main():
    """Load the data and generate the word cloud."""
    dm = pd.read_excel('E:/Crawler/output/Top8_Danmu.xlsx', sheet_name='Sheet1')
    stopwords = {'', '', '', '', '', '', '', '', '', '', '', '', '', '', ''}
    wordcloud_generation(dm, stopwords, '/output/danmu_dwordcloud.png')
    print("Word cloud generated!")


if __name__ == '__main__':
    main()

developed_code/b_wordcloud.py
@@ -0,0 +1,55 @@
"""
Generate a word cloud from the full danmu data set, with keyword extraction
and normalization.
"""
import re
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
from jieba import analyse


def blue_color_func(*_args, **_kwargs):
    """Blue palette for the word cloud colors; ignores all WordCloud callback arguments."""
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


def normalize_hahaha(text):
    """Normalize every run of three or more '哈' to the canonical '哈哈哈'."""
    return re.sub(r'哈{3,}', '哈哈哈', text)


def process_keywords(dm_list):
    """Run the keyword extraction in a worker thread."""
    dm_string = ' '.join(dm_list)
    with ThreadPoolExecutor() as executor:
        keywords = list(executor.map(
            lambda kw: analyse.extract_tags(kw, topK=100, withWeight=False, allowPOS=()),
            [dm_string],
        ))
    return ' '.join(keywords[0])


def wordcloud_generation(danmu_data, stopwords, output_path):
    """Generate the word cloud image and save it."""
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dm_list = [normalize_hahaha(text) for text in dm_list]
    dmreal_string = process_keywords(dm_list)
    img = imread("/output/OIP.jpg")
    wc = wordcloud.WordCloud(
        stopwords=stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file(output_path)


def main():
    """Load the data and generate the word cloud."""
    dm = pd.read_excel('E:/Crawler/output/All_Danmu.xlsx', sheet_name='Sheet1')
    stopwords = {'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '哈哈哈'}
    wordcloud_generation(dm, stopwords, '/output/alldanmu_dwordcloud.png')


if __name__ == '__main__':
    main()

developed_code/bvid.py
@@ -0,0 +1,54 @@
"""
Extract video BV ids from Bilibili search results and save them to a file.
"""
import re
from concurrent.futures import ThreadPoolExecutor

import requests
from common_headers import HEADERS


def get_source(page_num):
    """Fetch the HTML source of one Bilibili search-result page."""
    get_url = (
        f'https://api.bilibili.com/x/web-interface'
        f'/wbi/search/type?__refresh__=true&_extra=&'
        f'context=&page={page_num}&page_size=42&from_source=&from_spmid=333.337&'
        'platform=pc&highlight=1&single_column=0&keyword=2024巴黎奥运会'
        '&qv_id=zaOudcC1LJI0GehR81nuNQEKktKQ2aP1&ad_resource=5654'
        '&source_tag=3&gaia_vtoken=&category_id=&search_type=video'
    )
    response = requests.get(url=get_url, headers=HEADERS, timeout=10)
    return response.text


def extract_bv(source_html):
    """Extract the BV ids from the search-result HTML source."""
    return re.findall('"bvid":"(.*?)","title":".*?', source_html)


def save_bv_to_file(bv_list):
    """Append the BV ids to the output file."""
    with open('/output/bv_numbers.txt', 'a', encoding='utf-8') as f:
        for bv in bv_list:
            f.write(bv + '\n')


def process_pages(page_range):
    """Fetch several result pages in parallel."""
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(get_source, page_range))
    return results


def main():
    """Collect BV ids from several result pages and save them."""
    counter = 0
    page_range = range(1, 9)
    html_sources = process_pages(page_range)
    for html_source in html_sources:
        bvs = extract_bv(html_source)
        save_bv_to_file(bvs)
        counter += len(bvs)
        if counter >= 300:
            break
    print("BV id collection finished")


if __name__ == '__main__':
    main()

developed_code/common_headers.py
@@ -0,0 +1,22 @@
"""
Shared HTTP request headers.
"""
HEADERS = {
    'authority': 'api.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cookie': '...',
    'origin': 'https://www.bilibili.com',
    'referer': 'https://space.bilibili.com/1760559884?spm_id_from=333.788.0.0',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}

File diff suppressed because it is too large

@@ -0,0 +1,29 @@
"""Profile the whole pipeline with cProfile and write a report."""
import cProfile
import pstats

import a_wordcloud
import bvid
import to_allexcel
import to_danmu
import to_excel


def run_all():
    bvid.main()
    to_danmu.main()
    to_allexcel.main()
    to_excel.main()
    a_wordcloud.main()


if __name__ == '__main__':
    profiler = cProfile.Profile()
    profiler.enable()
    run_all()
    profiler.disable()
    profiler.dump_stats('performance_profile.prof')
    # Analyze the results and write them to a text report
    with open('performance_report.txt', 'w', encoding='utf-8') as f:
        ps = pstats.Stats(profiler, stream=f)
        ps.sort_stats('cumulative')
        ps.print_stats()

developed_code/to_allexcel.py
@@ -0,0 +1,77 @@
"""
Read the danmu data, count frequencies, and save the statistics to an Excel file.
"""
from concurrent.futures import ThreadPoolExecutor

import pandas as pd


def load_danmu(file_path):
    """Read the danmu lines from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()


def count_danmu(danmu_list):
    """Count how many times each danmu occurs, stripping lines in parallel."""
    all_danmus = {}
    with ThreadPoolExecutor() as executor:
        for danmu in executor.map(str.strip, danmu_list):
            all_danmus[danmu] = all_danmus.get(danmu, 0) + 1
    return all_danmus


def save_to_excel(all_danmus, excel_file):
    """Save the danmu frequency statistics to an Excel file."""
    sorted_danmus = sorted(all_danmus.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(sorted_danmus, columns=['danmu', 'count'])
    df.to_excel(excel_file, index=False)


def main():
    """Read the danmu data, count frequencies, and save to Excel."""
    danmu_file_path = '/output/danmu.txt'
    excel_file = '/output/All_Danmu.xlsx'
    danmu_list = load_danmu(danmu_file_path)
    all_danmus = count_danmu(danmu_list)
    save_to_excel(all_danmus, excel_file)
    print("All danmu statistics finished and saved to the Excel sheet")


if __name__ == '__main__':
    main()

developed_code/to_danmu.py
@@ -0,0 +1,49 @@
"""
Fetch the danmu of Bilibili videos and save them to a file.
"""
import re
import json
from concurrent.futures import ThreadPoolExecutor

import requests
from common_headers import HEADERS


def load_bv_numbers(file_path):
    """Read the BV ids from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]


def fetch_video_cids(bv_list):
    """Look up the CID of each video, fetching the pages in parallel."""
    cid_list = []
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(
            lambda bv: requests.get(
                f'https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp',
                headers=HEADERS, timeout=10),
            bv_list,
        ))
    for response in results:
        cid = json.loads(response.text)['data'][0]['cid']
        cid_list.append(cid)
    return cid_list


def fetch_and_save_danmu(cid_list, danmu_file):
    """Crawl the danmu of each video in parallel and append them to a file."""
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(
            lambda cid: requests.get(
                f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}',
                headers=HEADERS, timeout=10),
            cid_list,
        ))
    for response in results:
        response.encoding = response.apparent_encoding
        data_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
        with open(danmu_file, mode='a', encoding='utf-8') as f:
            for danmu in data_list:
                f.write(danmu + '\n')


def main():
    """Resolve CIDs from the BV ids, then crawl the danmu."""
    bv_file_path = '/output/bv_numbers.txt'
    danmu_output_file = '/output/danmu.txt'
    bv_numbers = load_bv_numbers(bv_file_path)
    cids = fetch_video_cids(bv_numbers)
    fetch_and_save_danmu(cids, danmu_output_file)
    print("Danmu crawling finished")


if __name__ == '__main__':
    main()

developed_code/to_excel.py
@@ -0,0 +1,41 @@
"""
Count the AI-related danmu and save the top 8 results to an Excel file.
"""
from concurrent.futures import ThreadPoolExecutor

import pandas as pd


def load_danmu(file_path):
    """Read the danmu lines from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()


def filter_and_count_danmu(danmu_list):
    """Count the frequency of AI-related danmu, stripping lines in parallel."""
    all_danmus = {}
    ai_keywords = ['ai', '智能', '技术', '应用', '人机', 'AI', '人工智能', '机器学习', '深度学习', '神经网络']
    with ThreadPoolExecutor() as executor:
        for danmu in executor.map(str.strip, danmu_list):
            if any(keyword in danmu for keyword in ai_keywords):
                all_danmus[danmu] = all_danmus.get(danmu, 0) + 1
    return all_danmus


def save_to_excel(all_danmus, excel_file):
    """Save the AI-related danmu statistics to an Excel file."""
    sorted_danmus = sorted(all_danmus.items(), key=lambda x: x[1], reverse=True)[:8]
    df = pd.DataFrame(sorted_danmus, columns=['danmu', 'count'])
    df.to_excel(excel_file, index=False)


def main():
    """Read the danmu data, count the AI-related ones, and save to Excel."""
    danmu_file_path = '/output/danmu.txt'
    excel_file = '/output/Top8_Danmu.xlsx'
    danmu_list = load_danmu(danmu_file_path)
    all_danmus = filter_and_count_danmu(danmu_list)
    save_to_excel(all_danmus, excel_file)
    print("AI-related danmu statistics finished and saved to the Excel sheet")


if __name__ == '__main__':
    main()

Binary file not shown.

Binary file not shown.

(new image: 17 KiB)

Binary file not shown.

Binary file not shown.

(new image: 22 KiB)

File diff suppressed because it is too large

File diff suppressed because it is too large

Binary file not shown.

(new image: 13 KiB)

Binary file not shown.