feat: code folder -> version of the project after running a Code Quality Analysis tool and eliminating all warnings

developed_code folder -> version improved after using cProfile to find the performance bottlenecks in the code

output folder -> the generated output
main
poppoppuppylove 2 months ago
parent 5baca231b8
commit 64d179c027

.idea/.gitignore vendored

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,5 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
  </profile>
</component>

.idea/vcs.xml
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>

code/a_wordcloud.py
@@ -0,0 +1,49 @@
"""
Generate a word cloud from the danmu (bullet-comment) data.
"""
import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
import jieba


def blue_color_func(_random_state=None, **_kwargs):
    """
    Generates a color in the HSL format with a random lightness value.

    Parameters:
        _random_state (None or int): Used to seed the random number generator.
        **_kwargs: Additional arguments (ignored in this function).

    Returns:
        str: A string representing the color in HSL format.
    """
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


def wordcloud_generation(danmu_data):
    """Generate the word cloud image and save it."""
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dm_string = ' '.join(dm_list)
    dmreal_string = ' '.join(jieba.lcut(dm_string))
    img = imread("E:/Crawler/output/OIP.jpg")
    my_stopwords = {'', '', '', '', '', '', '', '', '', '', '', '', '', '', ''}
    wc = wordcloud.WordCloud(
        stopwords=my_stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file('E:/Crawler/output/danmu_dwordcloud.png')


def main():
    """Load the data and generate the word cloud."""
    dm = pd.read_excel('E:/Crawler/output/Top8_Danmu.xlsx', sheet_name='Sheet1')
    wordcloud_generation(dm)


if __name__ == '__main__':
    main()

code/b_wordcloud.py
@@ -0,0 +1,48 @@
"""
Generate a word cloud from the full danmu data set, with keyword extraction
and normalization.
"""
import re
import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
from jieba import analyse


def blue_color_func(*_args, **_kwargs):
    """Blue palette for the word cloud colors; ignores all WordCloud callback arguments."""
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


def normalize_hahaha(text):
    """Normalize every run of three or more '哈' to the canonical '哈哈哈'."""
    return re.sub(r'哈{3,}', '哈哈哈', text)


def wordcloud_generation(danmu_data):
    """Generate the word cloud image and save it."""
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dm_list = [normalize_hahaha(text) for text in dm_list]
    dm_string = ' '.join(dm_list)
    keywords = analyse.extract_tags(dm_string, topK=100, withWeight=False, allowPOS=())
    keywords = [word for word in keywords if word not in my_stopwords]
    dmreal_string = ' '.join(keywords)
    img = imread("E:/Crawler/output/OIP.jpg")
    wc = wordcloud.WordCloud(
        stopwords=my_stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file('E:/Crawler/output/alldanmu_dwordcloud.png')


# Load the data and generate the word cloud
dm = pd.read_excel('E:/Crawler/output/All_Danmu.xlsx', sheet_name='Sheet1')
my_stopwords = {'', '', '', '', '', '', '', '', '', '', '', '', '不是', '', '哈哈哈',
                '', '', '', '', '', '', '', '呵呵', '', '嘿嘿', '哎呀', '', '', ''}
wordcloud_generation(dm)

code/bvid.py
@@ -0,0 +1,47 @@
"""
Extract video BV ids from Bilibili search results and save them to a file.
"""
import re
import requests
from common_headers import HEADERS  # shared request-headers module


def get_source(page_num):
    """Fetch the HTML source of one Bilibili search-result page."""
    get_url = (
        f'https://api.bilibili.com/x/web-interface'
        f'/wbi/search/type?__refresh__=true&_extra=&'
        f'context=&page={page_num}'
        '&page_size=42&from_source=&from_spmid=333.337&'
        'platform=pc&highlight=1&single_column=0&'
        'keyword=2024巴黎奥运会'
        '&qv_id=zaOudcC1LJI0GehR81nuNQEKktKQ2aP1&ad_resource=5654'
        '&source_tag=3&gaia_vtoken=&category_id=&search_type=video'
    )
    response = requests.get(url=get_url, headers=HEADERS, timeout=10)
    return response.text


def extract_bv(source_html):
    """Extract the BV ids from the search-result HTML source."""
    return re.findall('"bvid":"(.*?)","title":".*?', source_html)


def save_bv_to_file(bv_list):
    """Append the BV ids to the output file."""
    with open('E:/Crawler/output/bv_numbers.txt', 'a', encoding='utf-8') as f:
        for bv in bv_list:
            f.write(bv + '\n')


def main():
    """Loop over the result pages, collecting and saving BV ids."""
    counter = 0
    for page in range(1, 9):
        html_source = get_source(page)
        bvs = extract_bv(html_source)
        save_bv_to_file(bvs)
        counter += len(bvs)
        if counter >= 300:
            break
    print("BV id collection finished")


if __name__ == '__main__':
    main()

code/common_headers.py
@@ -0,0 +1,22 @@
"""
Shared HTTP request headers.
"""
HEADERS = {
    'authority': 'api.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cookie': '...',
    'origin': 'https://www.bilibili.com',
    'referer': 'https://space.bilibili.com/1760559884?spm_id_from=333.788.0.0',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}

Binary file not shown.

File diff suppressed because it is too large

@@ -0,0 +1,29 @@
"""Profile the whole pipeline with cProfile and write a report."""
import cProfile
import pstats

import a_wordcloud
import bvid
import to_allexcel
import to_danmu
import to_excel


def run_all():
    a_wordcloud.main()
    bvid.main()
    to_allexcel.main()
    to_danmu.main()
    to_excel.main()


if __name__ == '__main__':
    profiler = cProfile.Profile()
    profiler.enable()
    run_all()
    profiler.disable()
    profiler.dump_stats('performance_profile.prof')
    # Analyze the results and write them to a text report
    with open('performance_report.txt', 'w', encoding='utf-8') as f:
        ps = pstats.Stats(profiler, stream=f)
        ps.sort_stats('cumulative')
        ps.print_stats()

code/to_allexcel.py
@@ -0,0 +1,38 @@
"""
Read the danmu data, count frequencies, and save the statistics to an Excel file.
"""
import pandas as pd


def load_danmu(file_path):
    """Read the danmu lines from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()


def count_danmu(danmu_list):
    """Count how many times each danmu occurs."""
    all_danmus = {}
    for danmu in danmu_list:
        danmu = danmu.strip()
        all_danmus[danmu] = all_danmus.get(danmu, 0) + 1
    return all_danmus


def save_to_excel(all_danmus, excel_file):
    """Save the danmu frequency statistics to an Excel file."""
    sorted_danmus = sorted(all_danmus.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(sorted_danmus, columns=['danmu', 'count'])
    df.to_excel(excel_file, index=False)


def main():
    """Read the danmu data, count frequencies, and save to Excel."""
    danmu_file_path = 'E:/Crawler/output/danmu.txt'
    excel_file = 'E:/Crawler/output/All_Danmu.xlsx'
    danmu_list = load_danmu(danmu_file_path)
    all_danmus = count_danmu(danmu_list)
    save_to_excel(all_danmus, excel_file)
    print("All danmu statistics finished and saved to the Excel sheet")


if __name__ == '__main__':
    main()

code/to_danmu.py
@@ -0,0 +1,48 @@
"""
Fetch the danmu of Bilibili videos and save them to a file.
"""
import re
import json
import requests
from common_headers import HEADERS  # shared request-headers module


def load_bv_numbers(file_path):
    """Read the BV ids from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]


def fetch_video_cids(bv_list):
    """Look up the CID of each video."""
    cid_list = []
    for bv in bv_list:
        url = f'https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp'
        response = requests.get(url=url, headers=HEADERS, timeout=10)
        cid = json.loads(response.text)['data'][0]['cid']
        cid_list.append(cid)
    return cid_list


def fetch_and_save_danmu(cid_list, danmu_file):
    """Crawl the danmu of each video and append them to a file."""
    for cid in cid_list:
        url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        response = requests.get(url=url, headers=HEADERS, timeout=10)
        response.encoding = response.apparent_encoding
        data_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
        with open(danmu_file, mode='a', encoding='utf-8') as f:
            for danmu in data_list:
                f.write(danmu + '\n')


def main():
    """Resolve CIDs from the BV ids, then crawl the danmu."""
    bv_file_path = 'E:/Crawler/output/bv_numbers.txt'
    danmu_output_file = 'E:/Crawler/output/danmu.txt'
    bv_numbers = load_bv_numbers(bv_file_path)
    cids = fetch_video_cids(bv_numbers)
    fetch_and_save_danmu(cids, danmu_output_file)
    print("Danmu crawling finished")


if __name__ == '__main__':
    main()

code/to_excel.py
@@ -0,0 +1,41 @@
"""
Count the AI-related danmu and save the top 8 results to an Excel file.
"""
import pandas as pd


def load_danmu(file_path):
    """Read the danmu lines from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()


def filter_and_count_danmu(danmu_list):
    """Count the frequency of AI-related danmu."""
    all_danmus = {}
    ai_keywords = ['ai', '智能', '技术', '应用', '人机', 'AI', '人工智能', '机器学习', '深度学习', '神经网络']
    for danmu in danmu_list:
        if any(keyword in danmu for keyword in ai_keywords):
            danmu = danmu.strip()
            all_danmus[danmu] = all_danmus.get(danmu, 0) + 1
    return all_danmus


def save_to_excel(all_danmus, excel_file):
    """Save the AI-related danmu statistics to an Excel file."""
    sorted_danmus = sorted(all_danmus.items(), key=lambda x: x[1], reverse=True)[:8]
    df = pd.DataFrame(sorted_danmus, columns=['danmu', 'count'])
    df.to_excel(excel_file, index=False)


def main():
    """Read the danmu data, count the AI-related ones, and save to Excel."""
    danmu_file_path = 'E:/Crawler/output/danmu.txt'
    excel_file = 'E:/Crawler/output/Top8_Danmu.xlsx'
    danmu_list = load_danmu(danmu_file_path)
    all_danmus = filter_and_count_danmu(danmu_list)
    save_to_excel(all_danmus, excel_file)
    print("AI-related danmu statistics finished and saved to the Excel sheet")


if __name__ == '__main__':
    main()

@@ -0,0 +1,245 @@
## Running a Code Quality Analysis tool and eliminating all warnings
After some research, I chose pylint for the analysis.
#### **Pylint**
`Pylint` is a very popular static analysis tool for Python: it detects errors, style problems, and complexity issues in your code, gives the code a score, and lists every warning and error so you can fix them one by one to raise the quality.
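Pylint can also be driven from Python itself. The sketch below is a minimal example, assuming pylint >= 2.12, where `Run` accepts `exit=False` and exposes the 0-10 score as `linter.stats.global_note`; the file list and the 100-character line limit mirror the shell runs below:
```python
# Minimal sketch: run pylint programmatically over the crawler modules.
from pylint.lint import Run

results = Run(
    ['code/a_wordcloud.py', 'code/bvid.py', 'code/to_excel.py',
     '--max-line-length=100'],
    exit=False,  # keep the interpreter alive so we can read the stats
)
print(results.linter.stats.global_note)  # the score pylint reports out of 10
```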
Here are some records from the improvement process:
```text
(.venv) PS E:\Crawler> pylint a_wordcloud.py
************* Module a_wordcloud
a_wordcloud.py:38:0: C0304: Final newline missing (missing-final-newline)
a_wordcloud.py:1:0: C0114: Missing module docstring (missing-module-docstring)
a_wordcloud.py:8:0: C0116: Missing function or method docstring (missing-function-docstring)
a_wordcloud.py:9:11: C0209: Formatting a regular string which could be an f-string (consider-using-f-string)
a_wordcloud.py:8:20: W0613: Unused argument 'word' (unused-argument)
a_wordcloud.py:8:26: W0613: Unused argument 'font_size' (unused-argument)
a_wordcloud.py:8:37: W0613: Unused argument 'position' (unused-argument)
a_wordcloud.py:8:47: W0613: Unused argument 'orientation' (unused-argument)
a_wordcloud.py:8:60: W0613: Unused argument 'random_state' (unused-argument)
a_wordcloud.py:8:0: W0613: Unused argument 'kwargs' (unused-argument)
a_wordcloud.py:18:0: C0116: Missing function or method docstring (missing-function-docstring)
a_wordcloud.py:18:25: W0621: Redefining name 'dm' from outer scope (line 13) (redefined-outer-name)
-----------------------------------
Your code has been rated at 2.94/10
```
```text
(.venv) PS E:\Crawler> pylint bvid.py
************* Module bvid
bvid.py:8:0: C0301: Line too long (1272/100) (line-too-long)
bvid.py:18:0: C0301: Line too long (145/100) (line-too-long)
bvid.py:27:0: C0301: Line too long (335/100) (line-too-long)
bvid.py:65:0: C0304: Final newline missing (missing-final-newline)
bvid.py:1:0: C0114: Missing module docstring (missing-module-docstring)
bvid.py:22:0: C0103: Constant name "cid_num" doesn't conform to UPPER_CASE naming style (invalid-name)
bvid.py:25:0: C0116: Missing function or method docstring (missing-function-docstring)
bvid.py:25:0: C0103: Function name "Get_Source" doesn't conform to snake_case naming style (invalid-name)
bvid.py:25:15: W0621: Redefining name 'page' from outer scope (line 53) (redefined-outer-name)
bvid.py:31:4: W0621: Redefining name 'source' from outer scope (line 55) (redefined-outer-name)
bvid.py:29:15: W3101: Missing timeout argument for method 'requests.get' can cause your program to hang indefinitely (missing-timeout)
bvid.py:36:0: C0116: Missing function or method docstring (missing-function-docstring)
bvid.py:36:0: C0103: Function name "Get_Bv" doesn't conform to snake_case naming style (invalid-name)
bvid.py:36:11: W0621: Redefining name 'source' from outer scope (line 55) (redefined-outer-name)
bvid.py:37:4: W0621: Redefining name 'url_list' from outer scope (line 57) (redefined-outer-name)
bvid.py:37:15: R1734: Consider using [] instead of list() (use-list-literal)
bvid.py:45:0: C0116: Missing function or method docstring (missing-function-docstring)
bvid.py:45:0: C0103: Function name "Save_Bv" doesn't conform to snake_case naming style (invalid-name)
bvid.py:45:12: W0621: Redefining name 'url_list' from outer scope (line 57) (redefined-outer-name)
bvid.py:52:4: C0103: Constant name "bv_count" doesn't conform to UPPER_CASE naming style (invalid-name)
-----------------------------------
Your code has been rated at 3.55/10
```
```text
(.venv) PS E:\Crawler> pylint to_excel.py
************* Module to_excel
to_excel.py:1:0: C0114: Missing module docstring (missing-module-docstring)
to_excel.py:4:0: C0116: Missing function or method docstring (missing-function-docstring)
to_excel.py:10:0: C0116: Missing function or method docstring (missing-function-docstring)
to_excel.py:24:0: C0116: Missing function or method docstring (missing-function-docstring)
to_excel.py:30:0: C0116: Missing function or method docstring (missing-function-docstring)
-----------------------------------
Your code has been rated at 8.21/10
```
I revised the code many times in between; the score finally crept up from 8.21 to 9.23:
```text
(.venv) PS E:\Crawler> pylint E:\Crawler\code\a_wordcloud.py E:\Crawler\code\b_wordcloud.py E:\Crawler\code\bvid.py E:\Crawler\code\to_allexcel.py E:\Crawler\code\to_danmu.py E:\Crawler\code\to_excel.py
************* Module bvid
code\bvid.py:27:15: W0621: Redefining name 'page_number' from outer scope (line 52) (redefined-outer-name)
code\bvid.py:40:11: W0621: Redefining name 'source_code' from outer scope (line 53) (redefined-outer-name)
code\bvid.py:44:12: W0621: Redefining name 'bv_list' from outer scope (line 54) (redefined-outer-name)
code\bvid.py:51:4: C0103: Constant name "bv_count" doesn't conform to UPPER_CASE naming style (invalid-name)
************* Module to_danmu
code\to_danmu.py:33:19: W0621: Redefining name 'bv_numbers' from outer scope (line 58) (redefined-outer-name)
code\to_danmu.py:35:4: W0621: Redefining name 'video_cids' from outer scope (line 59) (redefined-outer-name)
code\to_danmu.py:43:22: W0621: Redefining name 'output_file' from outer scope (line 56) (redefined-outer-name)
code\to_danmu.py:55:4: C0103: Constant name "bv_file_path" doesn't conform to UPPER_CASE naming style (invalid-name)
code\to_danmu.py:56:4: C0103: Constant name "output_file" doesn't conform to UPPER_CASE naming style (invalid-name)
************* Module to_excel
code\to_excel.py:1:0: R0801: Similar lines in 2 files
==bvid:[7:28]
==to_danmu:[8:29]
HEADERS = {
    'authority': 'api.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cookie': '...',
    'origin': 'https://www.bilibili.com',
    'referer': 'https://space.bilibili.com/1760559884?spm_id_from=333.788.0.0',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}
------------------------------------------------------------------
Your code has been rated at 9.23/10
```
```text
(.venv) PS E:\Crawler> pylint E:\Crawler\code\a_wordcloud.py E:\Crawler\code\bvid.py E:\Crawler\code\to_allexcel.py E:\Crawler\code\to_danmu.py E:\Crawler\code\to_excel.py
************* Module to_danmu
code\to_danmu.py:38:4: C0103: Variable name "BV_FILE_PATH" doesn't conform to snake_case naming style (invalid-name)
code\to_danmu.py:39:4: C0103: Variable name "DANMU_OUTPUT_FILE" doesn't conform to snake_case naming style (invalid-name)
------------------------------------------------------------------
Your code has been rated at 9.84/10 (previous run: 9.59/10, +0.24)
```
This was the most exciting moment; victory felt within reach.
```text
-------------------------------------------------------------------
Your code has been rated at 10.00/10 (previous run: 9.84/10, +0.16)
```
Two more rounds of edits and it was done!
---
Below is a summary of the improvement process.
# 8.21 -> 10
## 1. Key problems and solutions
### **Problem 1: Naming style does not conform to PEP 8**
**Warning:**
```text
code\to_danmu.py:38:4: C0103: Variable name "BV_FILE_PATH" doesn't conform to snake_case naming style (invalid-name)
```
**Fix:**
Because these names are local variables inside `main()` rather than module-level constants, pylint expects `snake_case`; renaming `BV_FILE_PATH` to `bv_file_path` clears the warning. A short sketch follows below.
**Reflection:**
PEP 8 recommends `snake_case` for variable names and `UPPER_CASE` for constants. Following these conventions makes the code more readable and maintainable, and makes it easier for teammates to understand and take over.
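A minimal sketch of the rename (paths shortened and bodies abridged from the real scripts; the `main_before`/`main_after` names are just for this illustration):
```python
OUTPUT_DIR = 'E:/Crawler/output'  # module-level constant: UPPER_CASE is correct here


def main_before():
    # C0103: a local variable named as if it were a constant
    BV_FILE_PATH = OUTPUT_DIR + '/bv_numbers.txt'
    print(BV_FILE_PATH)


def main_after():
    # local variables use snake_case, so pylint is satisfied
    bv_file_path = OUTPUT_DIR + '/bv_numbers.txt'
    print(bv_file_path)
```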
---
### **Problem 2: Unused arguments**
**Warning:**
```text
code\a_wordcloud.py:10:20: W0613: Unused argument 'word' (unused-argument)
```
**Fix:**
Delete the unused parameters, or rename them with a leading `_` to signal that they are intentionally unused; see the sketch below.
**Reflection:**
Keeping unused parameters makes the code look redundant and confusing; cleaning them up reduces misunderstandings and improves conciseness and maintainability.
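Roughly how the color callback was slimmed down (the `_fixed` suffix exists only so both versions can live in one sketch; the real code keeps the original name):
```python
import numpy as np


# Before: W0613 fires for every parameter the WordCloud callback
# receives but never reads
def blue_color_func(word, font_size, position, orientation,
                    random_state=None, **kwargs):
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


# After: collapse everything into underscore-prefixed catch-alls
def blue_color_func_fixed(*_args, **_kwargs):
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"
```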
---
### **Problem 3: Variable redefinition**
**Warning:**
```text
code\bvid.py:9:15: W0621: Redefining name 'page_number' from outer scope (line 34) (redefined-outer-name)
```
**Fix:**
Rename the variables so the same name is not reused across scopes, e.g. change the parameter `page_number` to `page_num`; a sketch follows below.
**Reflection:**
Reusing the same name in different scopes can muddle the logic, especially in complex functions or loops. Giving each scope its own distinct name avoids latent logic errors and confusion.
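A stripped-down sketch of the shadowing pattern (the `build_url` helper is hypothetical, standing in for `get_source`):
```python
# Before: the parameter shadows the module-level loop variable below,
# which is exactly what W0621 complains about
def build_url(page_number):
    return f'https://api.bilibili.com/x/web-interface/wbi/search/type?page={page_number}'


for page_number in range(1, 3):
    print(build_url(page_number))


# After: the parameter gets its own name, so no scope shadows another
def build_url_fixed(page_num):
    return f'https://api.bilibili.com/x/web-interface/wbi/search/type?page={page_num}'
```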
---
### **Problem 4: Duplicate code**
**Warning:**
```text
code\to_excel.py:1:0: R0801: Similar lines in 2 files
```
**Fix:**
Extract the duplicated fragment into a standalone function or module and call it where needed. Here, the HTTP request headers that had been copied into every crawler were moved into a separate module and imported everywhere, as sketched below.
**Reflection:**
Duplicate code not only raises maintenance cost but can also let the copies drift out of sync. Extracting the common part makes the code more modular, clearer, and easier to maintain.
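The shape of the fix, abridged from this commit's `common_headers.py`:
```python
# common_headers.py -- the shared constant lives in exactly one place
HEADERS = {
    'authority': 'api.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    # ... the remaining header fields ...
}

# bvid.py, to_danmu.py, ... -- every crawler imports it instead of
# redefining its own copy, which is what silenced R0801:
# from common_headers import HEADERS
```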
---
### **Problem 5: Line length over the limit**
**Warning:**
```text
code\a_wordcloud.py:26:0: C0301: Line too long (135/100) (line-too-long)
```
**Fix:**
Split overlong lines, or use string formatting such as f-strings, to keep every line within 100 characters; see the sketch below.
**Reflection:**
Keeping lines short improves readability, especially on narrow displays or when code is printed. Concise, clear code takes less time to read and understand.
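How the long search URL in `bvid.py` was broken up, using Python's implicit concatenation of adjacent string literals (query string abridged):
```python
page_num = 1  # example value

# One 1200+ character literal becomes several short adjacent literals
# that the parser joins back into a single string
get_url = (
    'https://api.bilibili.com/x/web-interface/wbi/search/type'
    '?__refresh__=true&page_size=42&search_type=video'
    f'&page={page_num}&keyword=2024巴黎奥运会'
)
print(get_url)
```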
---
### **Problem 6: Import order and unused imports**
**Warning:**
```text
code\b_wordcloud.py:8:0: C0411: standard import "re" should be placed before third party imports "pandas", "numpy", "jieba.analyse" (wrong-import-order)
```
**Fix:**
Reorder the imports as PEP 8 recommends, standard library first and third-party packages after, and delete any unused imports; see the sketch below.
**Reflection:**
Organizing imports by convention keeps the file tidy and readable, and removing unused imports avoids needless side effects. Keeping the code lean and structured helps reduce latent errors.
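The ordering that clears C0411 in `b_wordcloud.py` (third-party list abridged):
```python
# Standard-library imports first ...
import re

# ... then third-party packages, separated by a blank line
import pandas as pd
import numpy as np
from jieba import analyse
```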
## Summary
- **Naming conventions**: Following the PEP 8 naming rules markedly improves readability and consistency, which matters most when several people collaborate.
- **Parameters and variables**: Cleaning up unused parameters and avoiding redefinition prevents needless confusion and errors.
- **Code duplication**: Extracting common parts reduces duplication and makes the code more modular and easier to maintain.
- **Line length**: Keeping lines short improves readability, especially on constrained displays or in print.
- **Import order**: Organizing imports by convention keeps the code tidy and reduces potential conflicts and errors.

developed_code/a_wordcloud.py
@@ -0,0 +1,48 @@
"""
Generate a word cloud from the danmu (bullet-comment) data.
"""
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
import jieba


def blue_color_func(_random_state=None, **_kwargs):
    """Generates a color in the HSL format with a random lightness value."""
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


def process_text(danmu_list):
    """Tokenize the danmu texts in parallel."""
    with ThreadPoolExecutor() as executor:
        dm_string = ' '.join(
            executor.map(lambda text: ' '.join(jieba.lcut(text)), danmu_list))
    return dm_string


def wordcloud_generation(danmu_data, stopwords, output_path):
    """Generate the word cloud image and save it."""
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dmreal_string = process_text(dm_list)
    img = imread("/output/OIP.jpg")
    wc = wordcloud.WordCloud(
        stopwords=stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file(output_path)


def main():
    """Load the data and generate the word cloud."""
    dm = pd.read_excel('E:/Crawler/output/Top8_Danmu.xlsx', sheet_name='Sheet1')
    stopwords = {'', '', '', '', '', '', '', '', '', '', '', '', '', '', ''}
    wordcloud_generation(dm, stopwords, '/output/danmu_dwordcloud.png')
    print("Word cloud generated!")


if __name__ == '__main__':
    main()

developed_code/b_wordcloud.py
@@ -0,0 +1,55 @@
"""
Generate a word cloud from the full danmu data set, with keyword extraction
and normalization.
"""
import re
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
import wordcloud
from matplotlib.image import imread
from jieba import analyse


def blue_color_func(*_args, **_kwargs):
    """Blue palette for the word cloud colors; ignores all WordCloud callback arguments."""
    return f"hsl(210, 100%, {np.random.randint(50, 90)}%)"


def normalize_hahaha(text):
    """Normalize every run of three or more '哈' to the canonical '哈哈哈'."""
    return re.sub(r'哈{3,}', '哈哈哈', text)


def process_keywords(dm_list):
    """Run the keyword extraction in a worker thread."""
    dm_string = ' '.join(dm_list)
    with ThreadPoolExecutor() as executor:
        keywords = list(executor.map(
            lambda kw: analyse.extract_tags(kw, topK=100, withWeight=False, allowPOS=()),
            [dm_string],
        ))
    return ' '.join(keywords[0])


def wordcloud_generation(danmu_data, stopwords, output_path):
    """Generate the word cloud image and save it."""
    dm_list = danmu_data['danmu'].dropna().astype(str).tolist()
    dm_list = [normalize_hahaha(text) for text in dm_list]
    dmreal_string = process_keywords(dm_list)
    img = imread("/output/OIP.jpg")
    wc = wordcloud.WordCloud(
        stopwords=stopwords,
        width=1920,
        height=1200,
        background_color='white',
        font_path='msyhl.ttc',
        mask=img,
        max_words=100,
        color_func=blue_color_func,
    ).generate(dmreal_string)
    wc.to_file(output_path)


def main():
    """Load the data and generate the word cloud."""
    dm = pd.read_excel('E:/Crawler/output/All_Danmu.xlsx', sheet_name='Sheet1')
    stopwords = {'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '哈哈哈'}
    wordcloud_generation(dm, stopwords, '/output/alldanmu_dwordcloud.png')


if __name__ == '__main__':
    main()

developed_code/bvid.py
@@ -0,0 +1,54 @@
"""
Extract video BV ids from Bilibili search results and save them to a file.
"""
import re
from concurrent.futures import ThreadPoolExecutor

import requests
from common_headers import HEADERS


def get_source(page_num):
    """Fetch the HTML source of one Bilibili search-result page."""
    get_url = (
        f'https://api.bilibili.com/x/web-interface'
        f'/wbi/search/type?__refresh__=true&_extra=&'
        f'context=&page={page_num}&page_size=42&from_source=&from_spmid=333.337&'
        'platform=pc&highlight=1&single_column=0&keyword=2024巴黎奥运会'
        '&qv_id=zaOudcC1LJI0GehR81nuNQEKktKQ2aP1&ad_resource=5654'
        '&source_tag=3&gaia_vtoken=&category_id=&search_type=video'
    )
    response = requests.get(url=get_url, headers=HEADERS, timeout=10)
    return response.text


def extract_bv(source_html):
    """Extract the BV ids from the search-result HTML source."""
    return re.findall('"bvid":"(.*?)","title":".*?', source_html)


def save_bv_to_file(bv_list):
    """Append the BV ids to the output file."""
    with open('/output/bv_numbers.txt', 'a', encoding='utf-8') as f:
        for bv in bv_list:
            f.write(bv + '\n')


def process_pages(page_range):
    """Fetch several result pages in parallel."""
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(get_source, page_range))
    return results


def main():
    """Collect BV ids from several result pages and save them."""
    counter = 0
    page_range = range(1, 9)
    html_sources = process_pages(page_range)
    for html_source in html_sources:
        bvs = extract_bv(html_source)
        save_bv_to_file(bvs)
        counter += len(bvs)
        if counter >= 300:
            break
    print("BV id collection finished")


if __name__ == '__main__':
    main()

developed_code/common_headers.py
@@ -0,0 +1,22 @@
"""
Shared HTTP request headers.
"""
HEADERS = {
    'authority': 'api.bilibili.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cookie': '...',
    'origin': 'https://www.bilibili.com',
    'referer': 'https://space.bilibili.com/1760559884?spm_id_from=333.788.0.0',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}

File diff suppressed because it is too large

@@ -0,0 +1,29 @@
"""Profile the whole pipeline with cProfile and write a report."""
import cProfile
import pstats

import a_wordcloud
import bvid
import to_allexcel
import to_danmu
import to_excel


def run_all():
    bvid.main()
    to_danmu.main()
    to_allexcel.main()
    to_excel.main()
    a_wordcloud.main()


if __name__ == '__main__':
    profiler = cProfile.Profile()
    profiler.enable()
    run_all()
    profiler.disable()
    profiler.dump_stats('performance_profile.prof')
    # Analyze the results and write them to a text report
    with open('performance_report.txt', 'w', encoding='utf-8') as f:
        ps = pstats.Stats(profiler, stream=f)
        ps.sort_stats('cumulative')
        ps.print_stats()

developed_code/to_allexcel.py
@@ -0,0 +1,77 @@
"""
Read the danmu data, count frequencies, and save the statistics to an Excel file.
"""
from concurrent.futures import ThreadPoolExecutor

import pandas as pd


def load_danmu(file_path):
    """Read the danmu lines from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()


def count_danmu(danmu_list):
    """Count how many times each danmu occurs, stripping lines in parallel."""
    all_danmus = {}
    with ThreadPoolExecutor() as executor:
        for danmu in executor.map(str.strip, danmu_list):
            all_danmus[danmu] = all_danmus.get(danmu, 0) + 1
    return all_danmus


def save_to_excel(all_danmus, excel_file):
    """Save the danmu frequency statistics to an Excel file."""
    sorted_danmus = sorted(all_danmus.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(sorted_danmus, columns=['danmu', 'count'])
    df.to_excel(excel_file, index=False)


def main():
    """Read the danmu data, count frequencies, and save to Excel."""
    danmu_file_path = '/output/danmu.txt'
    excel_file = '/output/All_Danmu.xlsx'
    danmu_list = load_danmu(danmu_file_path)
    all_danmus = count_danmu(danmu_list)
    save_to_excel(all_danmus, excel_file)
    print("All danmu statistics finished and saved to the Excel sheet")


if __name__ == '__main__':
    main()

developed_code/to_danmu.py
@@ -0,0 +1,49 @@
"""
Fetch the danmu of Bilibili videos and save them to a file.
"""
import re
import json
from concurrent.futures import ThreadPoolExecutor

import requests
from common_headers import HEADERS


def load_bv_numbers(file_path):
    """Read the BV ids from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]


def fetch_video_cids(bv_list):
    """Look up the CID of each video, fetching the pages in parallel."""
    cid_list = []
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(
            lambda bv: requests.get(
                f'https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp',
                headers=HEADERS, timeout=10),
            bv_list,
        ))
    for response in results:
        cid = json.loads(response.text)['data'][0]['cid']
        cid_list.append(cid)
    return cid_list


def fetch_and_save_danmu(cid_list, danmu_file):
    """Crawl the danmu of each video in parallel and append them to a file."""
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(
            lambda cid: requests.get(
                f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}',
                headers=HEADERS, timeout=10),
            cid_list,
        ))
    for response in results:
        response.encoding = response.apparent_encoding
        data_list = re.findall('<d p=".*?">(.*?)</d>', response.text)
        with open(danmu_file, mode='a', encoding='utf-8') as f:
            for danmu in data_list:
                f.write(danmu + '\n')


def main():
    """Resolve CIDs from the BV ids, then crawl the danmu."""
    bv_file_path = '/output/bv_numbers.txt'
    danmu_output_file = '/output/danmu.txt'
    bv_numbers = load_bv_numbers(bv_file_path)
    cids = fetch_video_cids(bv_numbers)
    fetch_and_save_danmu(cids, danmu_output_file)
    print("Danmu crawling finished")


if __name__ == '__main__':
    main()

developed_code/to_excel.py
@@ -0,0 +1,41 @@
"""
Count the AI-related danmu and save the top 8 results to an Excel file.
"""
from concurrent.futures import ThreadPoolExecutor

import pandas as pd


def load_danmu(file_path):
    """Read the danmu lines from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()


def filter_and_count_danmu(danmu_list):
    """Count the frequency of AI-related danmu, stripping lines in parallel."""
    all_danmus = {}
    ai_keywords = ['ai', '智能', '技术', '应用', '人机', 'AI', '人工智能', '机器学习', '深度学习', '神经网络']
    with ThreadPoolExecutor() as executor:
        for danmu in executor.map(str.strip, danmu_list):
            if any(keyword in danmu for keyword in ai_keywords):
                all_danmus[danmu] = all_danmus.get(danmu, 0) + 1
    return all_danmus


def save_to_excel(all_danmus, excel_file):
    """Save the AI-related danmu statistics to an Excel file."""
    sorted_danmus = sorted(all_danmus.items(), key=lambda x: x[1], reverse=True)[:8]
    df = pd.DataFrame(sorted_danmus, columns=['danmu', 'count'])
    df.to_excel(excel_file, index=False)


def main():
    """Read the danmu data, count the AI-related ones, and save to Excel."""
    danmu_file_path = '/output/danmu.txt'
    excel_file = '/output/Top8_Danmu.xlsx'
    danmu_list = load_danmu(danmu_file_path)
    all_danmus = filter_and_count_danmu(danmu_list)
    save_to_excel(all_danmus, excel_file)
    print("AI-related danmu statistics finished and saved to the Excel sheet")


if __name__ == '__main__':
    main()

Binary file not shown.

Binary file not shown.

(new image: 17 KiB)

Binary file not shown.

Binary file not shown.

(new image: 22 KiB)

File diff suppressed because it is too large

File diff suppressed because it is too large

Binary file not shown.

(new image: 13 KiB)

Binary file not shown.